20 lines
569 B
Python
20 lines
569 B
Python
from langchain_community.document_loaders.pdf import PyPDFLoader
|
|
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
|
import os
|
|
|
|
|
|
def load_split_docs(
    file_name: str,
    *,
    chunk_size: int = 2000,
    chunk_overlap: int = 300,
) -> list:
    """Load a PDF from ``documents/pdfs`` and split it into chunks.

    Args:
        file_name: Name of the PDF file. It is joined onto the relative
            directory ``documents/pdfs``, so the path is resolved against
            the current working directory.
        chunk_size: ``chunk_size`` passed to
            ``RecursiveCharacterTextSplitter``. Defaults to 2000, the value
            that was previously hard-coded.
        chunk_overlap: ``chunk_overlap`` passed to
            ``RecursiveCharacterTextSplitter``. Defaults to 300, the value
            that was previously hard-coded.

    Returns:
        The list produced by ``splitter.split_documents`` when applied to
        the documents returned by ``PyPDFLoader.load``.
    """
    # NOTE: os.path.join discards the "documents/pdfs" prefix if file_name
    # is an absolute path; callers are expected to pass a bare file name.
    file_path: str = os.path.join("documents", "pdfs", file_name)

    docs: list = PyPDFLoader(file_path).load()

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(docs)
|