Actualizar README.md

2024-04-25 16:04:56 -05:00
42 changed files with 128 additions and 17699 deletions
--- a/.streamlit/config.toml
+++ b/.streamlit/config.toml
@@ -1,5 +1,5 @@
 [theme]
 base = "dark"
-backgroundColor="#3f51b5"
+backgroundColor = "#0d2669"
-secondaryBackgroundColor="#2b2b4e"
+secondaryBackgroundColor = "#050550"
 font = "monospace"
--- a/README.md
+++ b/README.md
@@ -1,2 +1,2 @@
 # chat_pdf
-El siguiente proyecto construye un chatbot que usa un LLM, Langchain y streamlit para crear una aplicacion que permite chatear con uun pdf. 
+El siguiente proyecto construye un chatbot que usa un LLM, Langchain y streamlit para crear una aplicacion que permite chatear con un pdf. 
--- a/app.py
+++ b/app.py
@@ -1,40 +1,43 @@
 # import os
 # from dotenv import load_dotenv
 # from langchain_community.chat_models import ChatOpenAI
 import streamlit as st
-from chats.streamlit_tools import import_file  # ,clear_cache
+import os
 from dotenv import load_dotenv
 from langchain_community.chat_models import ChatOpenAI
 from chats.streamlit_tools import import_file, clear_cache
 from streamlit_extras.add_vertical_space import add_vertical_space
 from langchain_tools.pdf_tools import PdfLangChain
 from langchain_tools.lc_tools import LangChainTools
 from chats.chat_tools import MessageManager
 # App title
-st.set_page_config(page_title="LLMOneClusterTeam")
+st.set_page_config(page_title="Snowflake Arctic")
 # sidebar
 with st.sidebar:
    # Cargar el logo (asegúrate de que el archivo de imagen esté en la misma carpeta que tu script)
    logo_path = "documents/Logo azulblanco.png"
    # Ajusta el ancho según sea necesario
    logo = st.sidebar.image(logo_path, width=200)
-    # Ajusta el ancho según sea necesario
+    add_vertical_space(18)
    add_vertical_space(28)
    # pdf_name = import_file()
    st.markdown("Built by [OneCluster](https://www.onecluster.org/).")
 col1, col2 = st.columns([1.1, 1])
 with col1:
    st.title(
        "DocumentAssist",
    )
 with col2:
    logo_2 = st.image("documents/pdfs/logo_1-removebg-preview.png", width=110)
    pdf_name = import_file()
    # Crea un botón en Streamlit que llama a la función clear_cache() cuando se presiona
    if st.button('Eliminar caché'):
        clear_cache()
    if st.button('Reiniciar'):
        st.experimental_rerun()
    st.markdown(
        "Built by [OneCluster](https://www.onecluster.org/)."
    )
 st.title('💬📄 LLM CHat APP')
 if pdf_name:
    with st.spinner("Processing the document..."):
        # Inicializamos la clase PdfLangChain
        pdfLangChain = PdfLangChain(pdf_name)
        pdf_name = pdfLangChain.file_name
@@ -51,29 +54,21 @@ if pdf_name:
        # Cargamos el modelo de embeddings
        embedding_model = langChainTools.load_embedding_opnai()
        # Cargamos el modelo de embeddings
        # embedding_model = langChainTools.load_embedding_hf()
        # Creamos el vector store
        docstorage = langChainTools.create_vector_strore(
-            docs_split, pdf_name, embedding_model
+            docs_split,
-        )
+            pdf_name,
            embedding_model)
    # Cargamos el modelo LLM desde LangChain
-        # llm = langChainTools.load_llm_open_source()
+    llm = langChainTools.load_llm_openai()
        # Cargamos el modelo LLm de Ollama
        llm = langChainTools.load_llm_ollama()
    # Creamos la cadena que integra Vectorstroe, el LLM para hacer consultas.
    # Para este caso la cadena tene el parametro de memoria.
-
+    qa = langChainTools.define_retrieval_qa_memory(
-        if "qa" not in st.session_state.keys():
+        llm, docstorage,
-            st.session_state.qa = langChainTools.define_retrieval_qa_memory(
+        pdf_name,
-                llm, pdf_name, embedding_model
+        embedding_model)
            )
        # qa = langChainTools.define_retrieval_qa_memory(llm, pdf_name, embedding_model)
    # Store conversation history
    if "messages" not in st.session_state.keys():
@@ -115,30 +110,23 @@ if pdf_name:
        # Generate a new response if last message is not from assistant
        if st.session_state.messages[-1]["role"] != "assistant":
            with st.spinner("Thinking..."):
                # Creamos la cadena que integra Vectorstroe, el LLM para hacer consultas.
                # Para este caso la cadena tene el parametro de memoria.
-                # qa = langChainTools.define_retrieval_qa_memory(
+                qa = langChainTools.define_retrieval_qa_memory(
-                #    llm, docstorage, pdf_name, embedding_model
+                    llm, docstorage,
-                # )
+                    pdf_name,
                    embedding_model)
-                input = "\n".join([msg["content"] for msg in st.session_state.messages])
+                input = "\n".join([msg["content"]
                                   for msg in st.session_state.messages])
-                query = st.session_state.qa.invoke(
+                query = qa.invoke({"question": f"{input}"},
-                    {"question": f"{prompt}"}, return_only_outputs=True
+                                  return_only_outputs=True)
                )
-                response_text = query["answer"]
+                response = query["answer"]
                documents_source = query["source_documents"]
                messageManager = MessageManager()
                citation: str = messageManager.generate_citations(documents_source)
                # st.markdown(citation)
            with st.chat_message("assistant"):
-                st.write(response_text)
+                st.write(response)
                st.session_state.messages.append(
-                    {"role": "assistant", "content": response_text}
+                    {"role": "assistant", "content": response})
                )
                expander = st.expander("Fuentes")
                expander.markdown(citation)
--- a/app_2.py
+++ b/app_2.py
@@ -1,143 +0,0 @@
 # import os
 # from dotenv import load_dotenv
 # from langchain_community.chat_models import ChatOpenAI
 import streamlit as st
 from chats.streamlit_tools import import_file  # ,clear_cache
 from streamlit_extras.add_vertical_space import add_vertical_space
 from langchain_tools.pdf_tools import PdfLangChain
 from langchain_tools.lc_tools import LangChainTools
 from chats.chat_tools import MessageManager
 from langchain_community.llms import Ollama
 # App title
 st.set_page_config(page_title="LLMOneClusterTeam")
 # sidebar
 with st.sidebar:
    # Cargar el logo (asegúrate de que el archivo de imagen esté en la misma carpeta que tu script)
    logo_path = "documents/Logo azulblanco.png"
    logo = st.sidebar.image(logo_path, width=200)
    # Ajusta el ancho según sea necesario
    add_vertical_space(28)
    # pdf_name = import_file()
    st.markdown("Built by [OneCluster](https://www.onecluster.org/).")
 col1, col2 = st.columns([1.1, 1])
 with col1:
    st.title(
        "DocumentAssist",
    )
 with col2:
    logo_2 = st.image("documents/pdfs/logo_1-removebg-preview.png", width=110)
 pdf_name = import_file()
 if pdf_name:
    with st.spinner("Processing the document..."):
        # Inicializamos la clase PdfLangChain
        pdfLangChain = PdfLangChain(pdf_name)
        pdf_name = pdfLangChain.file_name
        # Cargamos el documento PDF
        docs: list = pdfLangChain.load_pdf()
        # Dividimos los documentos en partes mas pequenas
        docs_split: list = pdfLangChain.split_docs(docs)
        # Instanciamos la clase LangChainTools que contiene herramientras LangChain
        langChainTools = LangChainTools()
        # Cargamos el modelo de embeddings
        embedding_model = langChainTools.load_embedding_opnai()
        # Cargamos el modelo de embeddings
        # embedding_model = langChainTools.load_embedding_hf()
        # Creamos el vector store
        docstorage = langChainTools.create_vector_strore(
            docs_split, pdf_name, embedding_model
        )
    # Cargamos el modelo LLM desde LangChain
    # llm = langChainTools.load_llm_open_source()
    # Cargamos el modelo LLM desde LangChain
    llm = langChainTools.load_llm_openai()
    # Cargamos el modelo desde Ollama
    # llm = Ollama(model="gemma")
    # Creamos la cadena que integra Vectorstroe, el LLM para hacer consultas.
    # Para este caso la cadena tene el parametro de memoria.
    qa = langChainTools.define_retrieval_qa_memory(
        llm, docstorage, pdf_name, embedding_model
    )
    # Store conversation history
    if "messages" not in st.session_state.keys():
        st.session_state.messages = [
            {
                "role": "assistant",
                "content": "Hola, soy una IA con el que puedes chatear con tu PDF. Haz un pregunta al documento.",
            }
        ]
    # Display or clear chat messages
    for message in st.session_state.messages:
        with st.chat_message(message["role"]):
            st.write(message["content"])
    def clear_chat_history():
        st.session_state.messages = [
            {
                "role": "assistant",
                "content": "Hola, soy una IA con el que puedes chatear con tu PDF. Haz un pregunta al documento.",
            }
        ]
    st.sidebar.button("Clear chat history", on_click=clear_chat_history)
    @st.cache_resource
    def get_num_tokens(prompt):
        """Get the number of tokens in a given prompt"""
        return len(prompt.split())
    # Function for generating Snowflake Arctic response
    # User-provided prompt
    if prompt := st.chat_input():
        st.session_state.messages.append({"role": "user", "content": prompt})
        with st.chat_message("user"):
            st.write(prompt)
        # Generate a new response if last message is not from assistant
        if st.session_state.messages[-1]["role"] != "assistant":
            with st.spinner("Thinking..."):
                # Creamos la cadena que integra Vectorstroe, el LLM para hacer consultas.
                # Para este caso la cadena tene el parametro de memoria.
                qa = langChainTools.define_retrieval_qa_memory(
                    llm, docstorage, pdf_name, embedding_model
                )
                input = "\n".join([msg["content"] for msg in st.session_state.messages])
                query = qa.invoke({"question": f"{prompt}"}, return_only_outputs=True)
                response_text = query["answer"]
                documents_source = query["source_documents"]
                messageManager = MessageManager()
                citation: str = messageManager.generate_citations(documents_source)
                # st.markdown(citation)
            with st.chat_message("assistant"):
                st.write(response_text)
                st.session_state.messages.append(
                    {"role": "assistant", "content": response_text}
                )
                expander = st.expander("Fuentes")
                expander.markdown(citation)
--- a/chats/__pycache__/__init__.cpython-311.pyc
+++ b/chats/__pycache__/__init__.cpython-311.pyc
--- a/chats/__pycache__/chat_tools.cpython-311.pyc
+++ b/chats/__pycache__/chat_tools.cpython-311.pyc
--- a/chats/__pycache__/streamlit_tools.cpython-311.pyc
+++ b/chats/__pycache__/streamlit_tools.cpython-311.pyc
--- a/chats/chat_tools.py
+++ b/chats/chat_tools.py
@@ -2,51 +2,27 @@ from colorama import Fore, Back, Style
 class MessageManager:
    def create_chat(self, qa):
        # Emoticon de robot
-        ia_emoticon = "\U0001f916"  # Emoticon de robot Unicode
+        ia_emoticon = "\U0001F916"  # Emoticon de robot Unicode
-        humano_emoticon = "\U0001f604"  # Emoticon de carita feliz Unicode
+        humano_emoticon = "\U0001F604"  # Emoticon de carita feliz Unicode
        # Imprimir el texto en amarillo y negrita con el emoticon de robot
        # Definicimo el mensaje de la IA
-        print(
+        print(f"{ia_emoticon} " + Style.BRIGHT + Fore.YELLOW +
-            f"{ia_emoticon} "
+              "IA: " + Style.RESET_ALL + "Pregunta algo al documento")
            + Style.BRIGHT
            + Fore.YELLOW
            + "IA: "
            + Style.RESET_ALL
            + "Pregunta algo al documento"
        )
        while True:
            input_usuario = input(
-                Style.BRIGHT + Fore.BLUE + f"{humano_emoticon} You: " + Style.RESET_ALL
+                Style.BRIGHT + Fore.BLUE + f"{humano_emoticon} You: " + Style.RESET_ALL)
-            )
+            if input_usuario.lower() == 'salir':
            if input_usuario.lower() == "salir":
                break
-            bot_response = qa.invoke(
+            bot_response = qa.invoke({"question": f"{input_usuario}"},
-                {"question": f"{input_usuario}"}, return_only_outputs=True
+                                     return_only_outputs=True)
-            )
+            print(f'{ia_emoticon} ' + Style.BRIGHT + Fore.YELLOW +
-            print(
+                  'IA:' + Style.RESET_ALL + f'{bot_response["answer"]}')
                f"{ia_emoticon} "
                + Style.BRIGHT
                + Fore.YELLOW
                + "IA:"
                + Style.RESET_ALL
                + f'{bot_response["answer"]}'
            )
-    def generate_citations(self, documents_source: list) -> str:
+    def generate_citations(self):
-        text_source: str = ""
+        pass
        for index, document in enumerate(documents_source):
            quote: str = document.page_content
            source: str = document.metadata["source"].replace("documents/pdfs/", "")
            page: str = document.metadata["page"] + 1
            fuente: str = (
                f"**Fuente #{index + 1}:** \n '{quote}'\n(*{source}, P.{page})*"
            )
            text_source += fuente + "\n\n\n"
        return text_source
--- a/chats/streamlit_tools.py
+++ b/chats/streamlit_tools.py
@@ -1,48 +1,33 @@
 import streamlit as st
 import os
 # @st.cache_data
 def import_file() -> str:
    List_of_files: list = []
    # Cargar el archivo pdf
-    archivo = st.file_uploader("Arrastra o ingresa tu archivo .pdf", type=[".pdf"])
+    archivo = st.file_uploader(
-    nombre_archivo: str = ""
+        'Arrastra o ingresa tu archivo .pdf', type=['.pdf'])
    nombre_archivo: str = ''
    # Verificar si se ha cargado un archivo
    if archivo is not None:
        nombre_archivo = archivo.name
        # Agregamos el nombre a la lista de archvios para luego podr verificarlos
        List_of_files.append(nombre_archivo)
        # st.success(
        #    f"El numero de archivos en la lista  de archivos es de: {len(List_of_files)}"
        # )
        # Abrir un archivo en modo escritura binaria ('wb') para guardar el archivo de audio
-        with open(f"documents/pdfs/{nombre_archivo}", "wb") as new_file:
+
        with open(f'documents/pdfs/{nombre_archivo}', 'wb') as new_file:
            # Leer los datos del archivo cargado y escribirlos en el nuevo archivo
            new_file.write(archivo.read())
    # st.success(f"Se carga el archivo con nombre: {nombre_archivo}")
    # Verificamos que en la lista solo haya un archivo, de lo contrario, limpiamos la session_state
    if (
        "archivo_anterior" in st.session_state
        and st.session_state.archivo_anterior != nombre_archivo
    ):
        st.session_state.clear()
    st.session_state.archivo_anterior = nombre_archivo
    return nombre_archivo
 # Define la función para borrar el caché
 def clear_cache():
-    cache_path = os.path.join(st.__path__[0], "static", "cache")
+    cache_path = os.path.join(st.__path__[0], 'static', 'cache')
    for root, dirs, files in os.walk(cache_path):
        for file in files:
            os.remove(os.path.join(root, file))
-    st.success("Cache limpio exitosamente.")
+    st.success('Cache limpio exitosamente.')
--- a/documents/pdfs/1.TC_Malamud, Se está muriendo la democracia.pdf
+++ b/documents/pdfs/1.TC_Malamud, Se está muriendo la democracia.pdf
--- a/documents/pdfs/SocialBigDataSociologiaYCienciasSocialesComputacio.pdf
+++ b/documents/pdfs/SocialBigDataSociologiaYCienciasSocialesComputacio.pdf
--- a/documents/pdfs/logo_1-removebg-preview.png
+++ b/documents/pdfs/logo_1-removebg-preview.png
--- a/documents/pdfs/logo_1.jpeg
+++ b/documents/pdfs/logo_1.jpeg
--- a/documents/pdfs/se
+++ b/documents/pdfs/se
--- a/embeddings/1.tc_malamud,_se_está_muriendo_la_democracia/chroma.sqlite3
+++ b/embeddings/1.tc_malamud,_se_está_muriendo_la_democracia/chroma.sqlite3
--- a/embeddings/el_espejo_como_no-lugar/745425a9-bf48-4675-b32d-1167bc534894/data_level0.bin
+++ b/embeddings/el_espejo_como_no-lugar/745425a9-bf48-4675-b32d-1167bc534894/data_level0.bin
--- a/embeddings/el_espejo_como_no-lugar/745425a9-bf48-4675-b32d-1167bc534894/header.bin
+++ b/embeddings/el_espejo_como_no-lugar/745425a9-bf48-4675-b32d-1167bc534894/header.bin
--- a/embeddings/el_espejo_como_no-lugar/745425a9-bf48-4675-b32d-1167bc534894/length.bin
+++ b/embeddings/el_espejo_como_no-lugar/745425a9-bf48-4675-b32d-1167bc534894/length.bin
--- a/embeddings/el_espejo_como_no-lugar/745425a9-bf48-4675-b32d-1167bc534894/link_lists.bin
+++ b/embeddings/el_espejo_como_no-lugar/745425a9-bf48-4675-b32d-1167bc534894/link_lists.bin
--- a/embeddings/1.tc_malamud,_se_está_muriendo_la_democracia/bfe70be6-9587-4dc3-b193-b74ea4603585/data_level0.bin
+++ b/embeddings/1.tc_malamud,_se_está_muriendo_la_democracia/bfe70be6-9587-4dc3-b193-b74ea4603585/data_level0.bin
--- a/embeddings/1.tc_malamud,_se_está_muriendo_la_democracia/bfe70be6-9587-4dc3-b193-b74ea4603585/header.bin
+++ b/embeddings/1.tc_malamud,_se_está_muriendo_la_democracia/bfe70be6-9587-4dc3-b193-b74ea4603585/header.bin
--- a/embeddings/1.tc_malamud,_se_está_muriendo_la_democracia/bfe70be6-9587-4dc3-b193-b74ea4603585/length.bin
+++ b/embeddings/1.tc_malamud,_se_está_muriendo_la_democracia/bfe70be6-9587-4dc3-b193-b74ea4603585/length.bin
--- a/embeddings/1.tc_malamud,_se_está_muriendo_la_democracia/bfe70be6-9587-4dc3-b193-b74ea4603585/link_lists.bin
+++ b/embeddings/1.tc_malamud,_se_está_muriendo_la_democracia/bfe70be6-9587-4dc3-b193-b74ea4603585/link_lists.bin
--- a/embeddings/el_espejo_como_no-lugar/chroma.sqlite3
+++ b/embeddings/el_espejo_como_no-lugar/chroma.sqlite3
--- a/embeddings/se_está_muriendo_la_democracia/4c669150-7080-47b5-8d56-e59ab97de98c/data_level0.bin
+++ b/embeddings/se_está_muriendo_la_democracia/4c669150-7080-47b5-8d56-e59ab97de98c/data_level0.bin
--- a/embeddings/se_está_muriendo_la_democracia/4c669150-7080-47b5-8d56-e59ab97de98c/header.bin
+++ b/embeddings/se_está_muriendo_la_democracia/4c669150-7080-47b5-8d56-e59ab97de98c/header.bin
--- a/embeddings/se_está_muriendo_la_democracia/4c669150-7080-47b5-8d56-e59ab97de98c/length.bin
+++ b/embeddings/se_está_muriendo_la_democracia/4c669150-7080-47b5-8d56-e59ab97de98c/length.bin
--- a/embeddings/se_está_muriendo_la_democracia/4c669150-7080-47b5-8d56-e59ab97de98c/link_lists.bin
+++ b/embeddings/se_está_muriendo_la_democracia/4c669150-7080-47b5-8d56-e59ab97de98c/link_lists.bin
--- a/embeddings/se_está_muriendo_la_democracia/chroma.sqlite3
+++ b/embeddings/se_está_muriendo_la_democracia/chroma.sqlite3
--- a/embeddings/socialbigdatasociologiaycienciassocialescomputacio/chroma.sqlite3
+++ b/embeddings/socialbigdatasociologiaycienciassocialescomputacio/chroma.sqlite3
--- a/embeddings/socialbigdatasociologiaycienciassocialescomputacio/eaf7d8a5-f2d4-4b12-a6d3-94efe772d2fd/data_level0.bin
+++ b/embeddings/socialbigdatasociologiaycienciassocialescomputacio/eaf7d8a5-f2d4-4b12-a6d3-94efe772d2fd/data_level0.bin
--- a/embeddings/socialbigdatasociologiaycienciassocialescomputacio/eaf7d8a5-f2d4-4b12-a6d3-94efe772d2fd/header.bin
+++ b/embeddings/socialbigdatasociologiaycienciassocialescomputacio/eaf7d8a5-f2d4-4b12-a6d3-94efe772d2fd/header.bin
--- a/embeddings/socialbigdatasociologiaycienciassocialescomputacio/eaf7d8a5-f2d4-4b12-a6d3-94efe772d2fd/length.bin
+++ b/embeddings/socialbigdatasociologiaycienciassocialescomputacio/eaf7d8a5-f2d4-4b12-a6d3-94efe772d2fd/length.bin
--- a/embeddings/socialbigdatasociologiaycienciassocialescomputacio/eaf7d8a5-f2d4-4b12-a6d3-94efe772d2fd/link_lists.bin
+++ b/embeddings/socialbigdatasociologiaycienciassocialescomputacio/eaf7d8a5-f2d4-4b12-a6d3-94efe772d2fd/link_lists.bin
--- a/langchain_tools/__pycache__/__init__.cpython-311.pyc
+++ b/langchain_tools/__pycache__/__init__.cpython-311.pyc
--- a/langchain_tools/__pycache__/lc_tools.cpython-311.pyc
+++ b/langchain_tools/__pycache__/lc_tools.cpython-311.pyc
--- a/langchain_tools/__pycache__/pdf_tools.cpython-311.pyc
+++ b/langchain_tools/__pycache__/pdf_tools.cpython-311.pyc
--- a/langchain_tools/lc_tools.py
+++ b/langchain_tools/lc_tools.py
@@ -2,18 +2,17 @@ from langchain_openai import OpenAIEmbeddings
 from langchain_community.vectorstores import Chroma
 from langchain_community.llms import OpenAI
 from langchain_community.chat_models import ChatOpenAI
 from langchain_core.prompts import PromptTemplate
 from langchain.memory.buffer import ConversationBufferMemory
 import os
 import streamlit as st
 from dotenv import load_dotenv
-from langchain.chains import RetrievalQAWithSourcesChain, ConversationalRetrievalChain
+from langchain.chains import (
-from langchain_community.llms import HuggingFaceEndpoint
+    RetrievalQAWithSourcesChain,
-from langchain_community.embeddings import HuggingFaceEmbeddings
+    ConversationalRetrievalChain)
 from streamlit.runtime.state import session_state
 from langchain_community.llms import Ollama
-class LangChainTools:
+class LangChainTools():
    """
    Esta clase maneja algunas herramientas integraciones con las que
    cuenta LangChain.
@@ -28,38 +27,15 @@ class LangChainTools:
        # Cargamos la variable que contiene la api_key de OpenAI
        load_dotenv()
-        openai_api_key = os.getenv("api_key")
+        openai_api_key = os.getenv('api_key')
        # Define an OpenAI embeddings model
        self.embedding_model = OpenAIEmbeddings(openai_api_key=openai_api_key)
        # st.success('El modelo de embeddins de OpneAI se ha cargado')
        return self.embedding_model
-    def load_embedding_hf(self):
+    @st.cache_resource
-        """Esta funcion carga un modelo de embedding de OpenAI
+    def create_vector_strore(_self, _docs_split: list, _file_name: str, _embedding_model):
        Returns:
            _type_: Retorno a un objetito de tipo embedding de OpenAI
        """
        huggingfacehub_api_token = "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"
        model_name = "sentence-transformers/all-mpnet-base-v2"
        model_kwargs = {"device": "cpu"}
        encode_kwargs = {"normalize_embeddings": False}
        self.embedding_model = HuggingFaceEmbeddings(
            model_name=model_name,
            model_kwargs=model_kwargs,
            encode_kwargs=encode_kwargs,
        )
        return self.embedding_model
    # @st.cache_resource
    def create_vector_strore(
        _self, _docs_split: list, _file_name: str, _embedding_model
    ):
        """Esta funcion construye un vector store a partir de un documento
        Args:
@@ -67,36 +43,23 @@ class LangChainTools:
            _file_name (str): Nombre del documento
        """
-        # db_name = _file_name.replace(".pdf", "").replace(" ", "_").lower()
+        db_name = _file_name.replace('.pdf', '').replace(' ', '_').lower()
        # st.success(_file_name)
        if "db_name" not in st.session_state.keys():
            st.session_state.db_name = (
                _file_name.replace(".pdf", "").replace(" ", "_").lower()
            )
        # Cargamos el modelo de embeddings
        # _embedding_model = self._embedding_model
        # Verificamos si existe la vector strore
-        # persist_directory = f"embeddings/{db_name}"
+        persist_directory = f"embeddings/{db_name}"
-        if "persist_director" not in st.session_state.keys():
+        if os.path.exists(persist_directory):
            st.session_state.persist_directory = (
                f"embeddings/{st.session_state.db_name}"
            )
        if os.path.exists(st.session_state.persist_directory):
            vectordb = Chroma(
-                persist_directory=st.session_state.persist_directory,
+                persist_directory=persist_directory,
-                embedding_function=_embedding_model,
+                embedding_function=_embedding_model)
            )
        else:
            vectordb = Chroma.from_documents(
-                persist_directory=st.session_state.persist_directory,
+                persist_directory=persist_directory,
                documents=_docs_split,
-                embedding=_embedding_model,
+                embedding=_embedding_model)
            )
            vectordb.persist()
@@ -111,67 +74,36 @@ class LangChainTools:
        # Cargamos la variable que contiene la api_key de OpenAI
        load_dotenv()
-        openai_api_key = os.getenv("api_key")
+        openai_api_key = os.getenv('api_key')
        temperature = 0.5
-        llm_openai = ChatOpenAI(
+        llm_openai = ChatOpenAI(model_name="gpt-3.5-turbo",
            model_name="gpt-3.5-turbo",
                                temperature=temperature,
                                openai_api_key=openai_api_key,
-            max_tokens=1000,
+                                max_tokens=1000)
        )
        return llm_openai
-    @st.cache_resource
+    def load_prompt_template(self):
-    def load_llm_open_source(_self):
+        """Esta funcion construye un prompt template de lanfchain.
        """Esta funcion carga un modelo de LLM OpenSource desde HuggingFace
        Returns:
-            _type_: Retorno a un objetito de tipo LLM de OpenAI
+            _type_: Retorno a un prompt template de LangChain.
        """
-        # model_huggingface = "google/gemma-1.1-7b-it"  # Es buena y funciona en espanol
+        template = """Responde a la siguiente pregunta utilizando los documentos proporcionados y citando las fuentes relevantes entre corchetes []:
        # model_huggingface = (
        #     "google/gemma-1.1-2b-it"  # Es buena y funciona en espanol funciona rapido
        # )
        # model_huggingface = "tiiuae/falcon-7b-instruct"
        # model_huggingface = "mistralai/Mistral-7B-Instruct-v0.2"
        # model_huggingface = 'mistralai/Mixtral-8x7B-Instruct-v0.1'
        huggingfacehub_api_token = "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"
-        model_huggingface = "mistralai/Mixtral-8x7B-Instruct-v0.1"  # Es buena y funciona en espanol funciona rapido
+        Pregunta: {question}
-        # Define the LLM
+        Respuesta:"""
        llm = HuggingFaceEndpoint(
            repo_id=model_huggingface,
            huggingfacehub_api_token=huggingfacehub_api_token,
            temperature=0.1,
            max_new_tokens=1000,
        )
-        return llm
+        prompt_template = PromptTemplate(
            template=template, input_variables=["question"])
-    @st.cache_resource
+        return prompt_template
    def load_llm_ollama(_self):
        """Esta funcion carga un modelo de LLM OpenSource desde Ollama
-        Returns:
+    def define_retrieval_qa(
-            _type_: Retorno a un objetito de tipo LLM de OpenAI
+        self, _llm, _vectordb, _file_name, _embedding_model
-        """
+    ):
        # Elegimos el modelo de Ollama que utilizaremos
        model: str = "llama2:7b"
        llm = Ollama(
            model=model,
            temperature=0.1,
            num_ctx=1000,
        )
        return llm
    def define_retrieval_qa(self, _llm, _vectordb, _file_name, _embedding_model):
        """Esta función integra un LLM y una base de datos vectorial en una
        chain de LangChain para hacer requerimientos. Este modelo no integra memoria.
@@ -188,14 +120,14 @@ class LangChainTools:
            y la BDV.
        """
-        db_name = _file_name.replace(".pdf", "").replace(" ", "_").lower()
+        db_name = _file_name.replace('.pdf', '').replace(' ', '_').lower()
        # Verificamos si existe la vector strore
        persist_directory = f"embeddings/{db_name}"
        _vectordb = Chroma(
-            persist_directory=persist_directory, embedding_function=_embedding_model
+            persist_directory=persist_directory,
-        )
+            embedding_function=_embedding_model)
        # Define the Retrieval QA Chain to integrate the database and LLM
        qa = RetrievalQAWithSourcesChain.from_chain_type(
@@ -209,8 +141,10 @@ class LangChainTools:
        return qa
-    # @st.cache_resource
+    @st.cache_resource
-    def define_retrieval_qa_memory(_self, _llm, _file_name, _embedding_model):
+    def define_retrieval_qa_memory(
        _self, _llm, _vectordb, _file_name, _embedding_model
    ):
        """Esta función integra un LLM y una base de datos vectorial en una
        chain de LangChain para hacer requerimientos. Este modelo integra memoria.
@@ -227,45 +161,25 @@ class LangChainTools:
            y la BDV.
        """
-        # db_name = _file_name.replace(".pdf", "").replace(" ", "_").lower()
+        db_name = _file_name.replace('.pdf', '').replace(' ', '_').lower()
        # Verificamos si existe la vector strore
-        # persist_directory = f"embeddings/{db_name}"
+        persist_directory = f"embeddings/{db_name}"
        _vectordb = Chroma(
-            persist_directory=st.session_state.persist_directory,
+            persist_directory=persist_directory,
-            embedding_function=_embedding_model,
+            embedding_function=_embedding_model)
        )
        # Configura la memoria
        memory = ConversationBufferMemory(
-            memory_key="chat_history", return_messages=True, output_key="answer"
+            memory_key="chat_history", return_messages=True)
        )
        # Define the Retrieval QA Chain to integrate the database and LLM
        conversation = ConversationalRetrievalChain.from_llm(
            _llm,
            retriever=_vectordb.as_retriever(),
            memory=memory,
-            verbose=True,  # Modo verboso
+            verbose=False  # Modo verboso
            return_source_documents=True,  # Devuelve los documentos fuente
        )
        template = """Utiliza los siguientes fragmentos de contexto para responder en español la pregunta al final. Si no sabes la respuesta, simplemente di que no sabes, no intentes inventar una respuesta.
        {context}
        Pregunta: {question}
        Respuesta:"""
        # template = """Utiliza los siguientes fragmentos de contexto como ejemplo para responder la pregunta al final. Organiza tu respuesta de manera clara y concisa, proporcionando información relevante y evitando divagaciones innecesarias.
        # {context}
        # Pregunta: {question}
        # Respuesta en español:"""
        conversation.combine_docs_chain.llm_chain.prompt.template = template
        # conversation.question_generator.prompt.template = "Dado el siguiente diálogo y una pregunta de seguimiento, reformula la pregunta de seguimiento para que sea una pregunta independiente, en su idioma original.\n\nHistorial del chat:\n{chat_history}\nPregunta de seguimiento: {question}\nPregunta independiente:"
        return conversation
--- a/langchain_tools/pdf_tools.py
+++ b/langchain_tools/pdf_tools.py
@@ -22,7 +22,7 @@ class PdfLangChain:
        """
        self.file_name = file_name
-        self.file_path = os.path.join("documents", "pdfs", self.file_name)
+        self.file_path = os.path.join('documents', 'pdfs', self.file_name)
        # Verificar si el directorio exist, sino, crearlo
        if not os.path.exists(self.file_path):
@@ -38,8 +38,6 @@ class PdfLangChain:
        loader = PyPDFLoader(_self.file_path)
        _self.docs = loader.load()
        # st.success(f"Se carga el pdf : {_self.file_path}")
        return _self.docs
    def split_docs(self, data: list) -> list:
@@ -56,10 +54,8 @@ class PdfLangChain:
        chunk_overlap = 300
        splitter = RecursiveCharacterTextSplitter(
-            chunk_size=chunk_size, chunk_overlap=chunk_overlap
+            chunk_size=chunk_size,
-        )
+            chunk_overlap=chunk_overlap)
        self.docs_split = splitter.split_documents(data)
        # st.success(f"{self.file_path[3][:200]}")
        return self.docs_split
--- a/pruebas.py
+++ b/pruebas.py
@ -1,8 +0,0 @@
 from langchain_community.llms import Ollama
 llm = Ollama(model="gemma")
 print(llm.invoke("Cual es tu nombre?"))
 query = "Cual es el sentido de la vida?"
--- a/pruebas_open_source.py
+++ b/pruebas_open_source.py
@ -1,67 +0,0 @@
 # from langchain_tools.lc_tools import LangChainTools
 from langchain_community.llms import HuggingFaceEndpoint
 # Instanciamos la clase LangChainTools que contiene herramientras LangChain
 # langChainTools = LangChainTools()
 # model_huggingface = "google/gemma-1.1-7b-it"  # Es buena y funciona en espanol
 # model_huggingface = (
 #    "google/gemma-1.1-2b-it"  # Es buena y funciona en espanol funciona rapido
 # )
 # model_huggingface = 'tiiuae/falcon-7b-instruct'
 model_huggingface = "mistralai/Mistral-7B-Instruct-v0.2"
 huggingfacehub_api_token = "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"
 # model_huggingface = "mistralai/Mixtral-8x22B-Instruct-v0.1"  # Es buena y funciona en espanol funciona rapido
 # Define the LLM
 llm = HuggingFaceEndpoint(
    repo_id=model_huggingface,
    huggingfacehub_api_token=huggingfacehub_api_token,
    temperature=0.5,
    max_new_tokens=500,
 )  # Cargamos el modelo LLM desde LangChainllm llm = langChainTools.load_llm_open_source()
 # respuesta = llm.invoke("Cual es el sentido de la vida?")
 # print(respuesta)
 import streamlit as st
 from chats.streamlit_tools import import_file  # ,clear_cache
 from streamlit_extras.add_vertical_space import add_vertical_space
 from langchain_tools.pdf_tools import PdfLangChain
 from langchain_tools.lc_tools import LangChainTools
 from chats.chat_tools import MessageManager
 pdf_name = "1.TC_Malamud, Se está muriendo la democracia.pdf"
 pdfLangChain = PdfLangChain(pdf_name)
 # Cargamos el documento PDF
 docs: list = pdfLangChain.load_pdf()
 # Dividimos los documentos en partes mas pequenas
 docs_split: list = pdfLangChain.split_docs(docs)
 # Instanciamos la clase LangChainTools que contiene herramientras LangChain
 langChainTools = LangChainTools()
 # Cargamos el modelo de embeddings
 # embedding_model = langChainTools.load_embedding_opnai()
 # Cargamos el modelo de embeddings
 embedding_model = langChainTools.load_embedding_hf()
 # Creamos el vector store
 docstorage = langChainTools.create_vector_strore(docs_split, pdf_name, embedding_model)
 # Cargamos el modelo LLM desde LangChain
 llm = langChainTools.load_llm_open_source()
 # Creamos la cadena que integra Vectorstroe, el LLM para hacer consultas.Para este caso la cadena tene el parametro de memoria.
 qa = langChainTools.define_retrieval_qa_memory(
    llm, docstorage, pdf_name, embedding_model
 )
 # qa.question_generator.prompt.template = "Dado el siguiente diálogo y una pregunta de seguimiento, reformula la pregunta de seguimiento para que sea una pregunta independiente, en su idioma original.\n\nHistorial del chat:\n{chat_history}\nPregunta de seguimiento: {question}\nPregunta independiente:"
 print(qa)
--- a/requirements.txt
+++ b/requirements.txt
@ -1,163 +1,13 @@
 aiohttp==3.9.5
 aiosignal==1.3.1
 altair==5.3.0
 annotated-types==0.6.0
 anyio==4.3.0
 asgiref==3.8.1
 attrs==23.2.0
 backoff==2.2.1
 bcrypt==4.1.2
 beautifulsoup4==4.12.3
 blinker==1.7.0
 build==1.2.1
 cachetools==5.3.3
 certifi==2024.2.2
 charset-normalizer==3.3.2
 chroma-hnswlib==0.7.3
 chromadb==0.5.0
 click==8.1.7
 colorama==0.4.6
 coloredlogs==15.0.1
 contourpy==1.2.1
 cycler==0.12.1
 dataclasses-json==0.6.4
 Deprecated==1.2.14
 distro==1.9.0
 entrypoints==0.4
 Faker==24.14.0
 fastapi==0.110.2
 favicon==0.7.0
 filelock==3.13.4
 flatbuffers==24.3.25
 fonttools==4.51.0
 frozenlist==1.4.1
 fsspec==2024.3.1
 gitdb==4.0.11
 GitPython==3.1.43
 google-auth==2.29.0
 googleapis-common-protos==1.63.0
 greenlet==3.0.3
 grpcio==1.62.2
 h11==0.14.0
 htbuilder==0.6.2
 httpcore==1.0.5
 httptools==0.6.1
 httpx==0.27.0
 huggingface-hub==0.22.2
 humanfriendly==10.0
 idna==3.7
 importlib-metadata==7.0.0
 importlib_resources==6.4.0
 Jinja2==3.1.3
 jsonpatch==1.33
 jsonpointer==2.4
 jsonschema==4.21.1
 jsonschema-specifications==2023.12.1
 kiwisolver==1.4.5
 kubernetes==29.0.0
 langchain==0.1.16
 langchain-community==0.0.34
 langchain-core==0.1.45
 langchain-openai==0.1.3
 langchain-text-splitters==0.0.1
 langsmith==0.1.50
 lxml==5.2.1
 Markdown==3.6
 markdown-it-py==3.0.0
 markdownlit==0.0.7
 MarkupSafe==2.1.5
 marshmallow==3.21.1
 matplotlib==3.8.4
 mdurl==0.1.2
 mmh3==4.1.0
 monotonic==1.6
 more-itertools==10.2.0
 mpmath==1.3.0
 multidict==6.0.5
 mypy-extensions==1.0.0
 numpy==1.26.4
 oauthlib==3.2.2
 onnxruntime==1.17.3
 openai==1.23.5
-opentelemetry-api==1.24.0
+
 opentelemetry-exporter-otlp-proto-common==1.24.0
 opentelemetry-exporter-otlp-proto-grpc==1.24.0
 opentelemetry-instrumentation==0.45b0
 opentelemetry-instrumentation-asgi==0.45b0
 opentelemetry-instrumentation-fastapi==0.45b0
 opentelemetry-proto==1.24.0
 opentelemetry-sdk==1.24.0
 opentelemetry-semantic-conventions==0.45b0
 opentelemetry-util-http==0.45b0
 orjson==3.10.1
 overrides==7.7.0
 packaging==23.2
 pandas==2.2.2
 pillow==10.3.0
 posthog==3.5.0
 prometheus_client==0.20.0
 protobuf==4.25.3
 pyarrow==16.0.0
 pyasn1==0.6.0
 pyasn1_modules==0.4.0
 pydantic==2.7.1
 pydantic_core==2.18.2
 pydeck==0.9.0b1
 Pygments==2.17.2
 pymdown-extensions==10.8
 pyparsing==3.1.2
 pypdf==4.2.0
 PyPika==0.48.9
 pyproject_hooks==1.0.0
 python-dateutil==2.9.0.post0
 python-dotenv==1.0.1
 pytz==2024.1
 PyYAML==6.0.1
 referencing==0.35.0
 regex==2024.4.16
 requests==2.31.0
 requests-oauthlib==2.0.0
 rich==13.7.1
 rpds-py==0.18.0
 rsa==4.9
 shellingham==1.5.4
 six==1.16.0
 smmap==5.0.1
 sniffio==1.3.1
 soupsieve==2.5
 SQLAlchemy==2.0.29
 st-annotated-text==4.0.1
 starlette==0.37.2
 streamlit==1.33.0
 streamlit-camera-input-live==0.2.0
 streamlit-card==1.0.0
 streamlit-embedcode==0.1.2
 streamlit-extras==0.4.2
 streamlit-faker==0.0.3
 streamlit-image-coordinates==0.1.6
 streamlit-keyup==0.2.4
 streamlit-toggle-switch==1.0.2
 streamlit-vertical-slider==2.5.5
 sympy==1.12
 tenacity==8.2.3
 tiktoken==0.6.0
 tokenizers==0.19.1
 toml==0.10.2
 toolz==0.12.1
 tornado==6.4
 tqdm==4.66.2
 typer==0.12.3
 typing-inspect==0.9.0
 typing_extensions==4.11.0
 tzdata==2024.1
 urllib3==2.2.1
 uvicorn==0.29.0
 uvloop==0.19.0
 validators==0.28.1
 watchdog==4.0.0
 watchfiles==0.21.0
 websocket-client==1.8.0
 websockets==12.0
 wrapt==1.16.0
 yarl==1.9.4
 zipp==3.18.1
`@ -1,2 +1,2 @@`
	`# chat_pdf`	`# chat_pdf`
	`El siguiente proyecto construye un chatbot que usa un LLM, Langchain y streamlit para crear una aplicacion que permite chatear con uun pdf.`	`El siguiente proyecto construye un chatbot que usa un LLM, Langchain y streamlit para crear una aplicacion que permite chatear con un pdf.`