DonConfia_Dev/agents/app/rag/split_docs.py
2025-03-24 20:33:04 -05:00

43 lines
1.4 KiB
Python

from langchain_community.document_loaders.pdf import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
# def load_split_docs(file_name: str) -> list:
# file_path: str = os.path.join("documents", "pdfs", file_name)
# loader = PyPDFLoader(file_path)
# docs: list = loader.load()
# chunk_size: int = 2000
# chunk_overlap: int = 300
#
# splitter = RecursiveCharacterTextSplitter(
# chunk_size=chunk_size, chunk_overlap=chunk_overlap
# )
# docs_split: list = splitter.split_documents(docs)
#
# return docs_split
def load_split_docs(
    file_name: str,
    *,
    chunk_size: int = 2000,
    chunk_overlap: int = 300,
) -> list:
    """Load a PDF from ``documents/pdfs`` and split it into text chunks.

    The PDF path is built as ``<base>/documents/pdfs/<file_name>``, where
    ``<base>`` is the parent of the directory that contains this module.
    Anchoring on ``__file__`` keeps the lookup independent of the process's
    current working directory.

    Args:
        file_name: Name of the PDF inside the ``documents/pdfs`` folder.
        chunk_size: Value passed to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``. Defaults to 2000.
        chunk_overlap: Value passed to ``RecursiveCharacterTextSplitter`` as
            ``chunk_overlap``. Defaults to 300.

    Returns:
        The list produced by ``splitter.split_documents`` for the documents
        that ``PyPDFLoader`` loaded from the file.

    Raises:
        FileNotFoundError: If the resolved path is not an existing regular
            file (this includes the case where it points at a directory).
    """
    # Parent of this module's directory; the PDFs live under it.
    base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    file_path = os.path.join(base_dir, "documents", "pdfs", file_name)

    # isfile() rather than exists(): a directory (e.g. from an empty
    # file_name) must be rejected here instead of failing later inside the
    # PDF loader with a less helpful error.
    if not os.path.isfile(file_path):
        print(f"Archivo no encontrado en: {file_path}")
        raise FileNotFoundError(f"No se encontró el archivo en: {file_path}")

    docs: list = PyPDFLoader(file_path).load()

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    return splitter.split_documents(docs)