43 lines
1.4 KiB
Python
43 lines
1.4 KiB
Python
from langchain_community.document_loaders.pdf import PyPDFLoader
|
|
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
|
import os
|
|
|
|
|
|
# def load_split_docs(file_name: str) -> list:
|
|
# file_path: str = os.path.join("documents", "pdfs", file_name)
|
|
# loader = PyPDFLoader(file_path)
|
|
# docs: list = loader.load()
|
|
# chunk_size: int = 2000
|
|
# chunk_overlap: int = 300
|
|
#
|
|
# splitter = RecursiveCharacterTextSplitter(
|
|
# chunk_size=chunk_size, chunk_overlap=chunk_overlap
|
|
# )
|
|
# docs_split: list = splitter.split_documents(docs)
|
|
#
|
|
# return docs_split
|
|
|
|
|
|
def load_split_docs(
    file_name: str, *, chunk_size: int = 2000, chunk_overlap: int = 300
) -> list:
    """Load a PDF from ``documents/pdfs`` and split it into chunks.

    The PDF path is built from the directory one level above the directory
    that contains this module, so the lookup does not depend on the current
    working directory.

    Args:
        file_name: Name of the PDF, joined onto
            ``<base dir>/documents/pdfs``.
        chunk_size: ``chunk_size`` forwarded to
            ``RecursiveCharacterTextSplitter``. Defaults to 2000.
        chunk_overlap: ``chunk_overlap`` forwarded to
            ``RecursiveCharacterTextSplitter``. Defaults to 300.

    Returns:
        The list returned by ``splitter.split_documents`` for the documents
        produced by ``PyPDFLoader.load``.

    Raises:
        FileNotFoundError: If the resolved path is not an existing file.
    """
    # Base directory of the project: the parent of this module's directory.
    base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

    # Absolute path to the PDF.
    file_path = os.path.join(base_dir, "documents", "pdfs", file_name)

    # Use isfile rather than exists: a directory at this path must be
    # rejected here instead of failing later inside the PDF loader.
    if not os.path.isfile(file_path):
        # Echo the resolved path to stdout as well as raising.
        print(f"Archivo no encontrado en: {file_path}")
        raise FileNotFoundError(f"No se encontró el archivo en: {file_path}")

    loader = PyPDFLoader(file_path)
    docs: list = loader.load()

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    docs_split: list = splitter.split_documents(docs)
    return docs_split
|