# File size: 885 Bytes
# 84deff7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
# pdf_processor.py
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import logging

# Configure the root logger once at import time: INFO and above, each record
# rendered as "<timestamp> - <level name> - <message>".
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Module-level splitter instance used by split_text(); configured with a
# chunk_size of 1000 and a chunk_overlap of 200.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)

def process_pdf(pdf_url: str) -> str:
    """Load a PDF and return the text of all its pages as one string.

    This is a best-effort helper: failures are logged, never raised.

    Args:
        pdf_url: Location of the PDF; passed unchanged to ``PyMuPDFLoader``.

    Returns:
        The ``page_content`` of every document returned by the loader,
        joined with newlines. Returns ``""`` when the loader yields nothing
        or when any exception occurs while loading or joining.
    """
    # Lazy %-style arguments: the message is only formatted if it is emitted.
    logging.info("Processing PDF from URL: %s", pdf_url)
    try:
        pages = PyMuPDFLoader(pdf_url).load()
        if not pages:
            logging.warning("No data found in PDF at %s", pdf_url)
            return ""
        return "\n".join(page.page_content for page in pages)
    except Exception as exc:
        # Deliberate catch-all boundary so callers always get a string back.
        # exc_info=True keeps the traceback in the log; the message line
        # itself is unchanged from the previous f-string version.
        logging.error(
            "Failed to process PDF at %s: %s", pdf_url, exc, exc_info=True
        )
        return ""

def split_text(text: str) -> list[str]:
    """Split *text* into chunks using the module-level ``text_splitter``.

    Args:
        text: The text to split.

    Returns:
        The list of chunks returned by ``text_splitter.split_text(text)``.
    """
    # The return annotation uses the built-in generic ``list[str]``
    # (Python 3.9+). The previous ``List[str]`` referenced a name this module
    # never imports, which raised NameError as soon as the module was loaded.
    return text_splitter.split_text(text)