62 lines
1.6 KiB
Python
62 lines
1.6 KiB
Python
import os
|
|
from langchain_community.document_loaders.pdf import PyPDFLoader
|
|
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
|
import streamlit as st
|
|
|
|
|
|
class PdfLangChain:
    """Load and split PDF documents with LangChain.

    PDF files are looked up under ``documents/pdfs`` relative to the current
    working directory.

    Attributes:
        file_name (str): Name of the PDF file.
        file_path (str): Path of the PDF file
            (``documents/pdfs/<file_name>``).
        docs (list): Documents returned by the last ``load_pdf`` call. Only
            set once ``load_pdf`` has run.
        docs_split (list): Chunks returned by the last ``split_docs`` call.
            Only set once ``split_docs`` has run.
    """

    def __init__(self, file_name: str) -> None:
        """Store the file location and ensure its directory exists.

        Args:
            file_name (str): Name of the PDF file.
        """
        self.file_name = file_name
        self.file_path = os.path.join('documents', 'pdfs', self.file_name)

        # Create the directory that holds the PDF, never the file path
        # itself: making a directory named like the PDF would prevent the
        # real file from ever being stored or read there. ``makedirs`` also
        # creates missing parents and tolerates an existing directory.
        os.makedirs(os.path.dirname(self.file_path), exist_ok=True)

    @staticmethod
    @st.cache_resource
    def _load_documents(file_path: str) -> list:
        """Load the PDF at ``file_path`` through ``PyPDFLoader``.

        The path is a regular (hashed) argument, so Streamlit keeps one
        cache entry per file instead of a single entry shared by every
        instance.

        Args:
            file_path (str): Path of the PDF file to load.

        Returns:
            list: Documents produced by ``PyPDFLoader.load``.
        """
        return PyPDFLoader(file_path).load()

    def load_pdf(self) -> list:
        """Load the PDF document, reusing Streamlit's resource cache.

        Returns:
            list: Loaded documents; also stored in ``self.docs``.
        """
        # Assign on every call so ``docs`` is populated even on a cache hit.
        self.docs = self._load_documents(self.file_path)
        return self.docs

    def split_docs(self, data: list, chunk_size: int = 2000,
                   chunk_overlap: int = 300) -> list:
        """Split documents into smaller chunks.

        Args:
            data (list): Loaded documents to split.
            chunk_size (int): Value passed as ``chunk_size`` to
                ``RecursiveCharacterTextSplitter``. Defaults to 2000.
            chunk_overlap (int): Value passed as ``chunk_overlap`` to
                ``RecursiveCharacterTextSplitter``. Defaults to 300.

        Returns:
            list: Chunks of the documents; also stored in
                ``self.docs_split``.
        """
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap)
        self.docs_split = splitter.split_documents(data)

        return self.docs_split
|