arxiv-rag-mvp / pdf_processor.py
donb-hf's picture
update services
84deff7
raw
history blame contribute delete
No virus
885 Bytes
# pdf_processor.py
import logging
from typing import List

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyMuPDFLoader
# Configure the root logger once, at import time: INFO and above, with a
# timestamp and level name in front of every message.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Module-wide splitter used by split_text(): chunks of at most 1000 units with
# 200 units shared between neighbouring chunks.
# NOTE(review): units are whatever the splitter's length function counts
# (characters by default in LangChain) — confirm against the library docs.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)
def process_pdf(pdf_url: str) -> str:
    """Extract the text of a PDF as a single string.

    The location is handed unchanged to ``PyMuPDFLoader``; the
    ``page_content`` of every loaded page is joined with a newline.

    Args:
        pdf_url: Location of the PDF, passed as-is to ``PyMuPDFLoader``.

    Returns:
        The newline-joined page texts, or ``""`` when the loader yields no
        pages or when any exception is raised while loading or joining.
        Failures are logged, never propagated.
    """
    logging.info(f"Processing PDF from URL: {pdf_url}")
    try:
        pages = PyMuPDFLoader(pdf_url).load()
        if pages:
            return "\n".join(page.page_content for page in pages)
        # The loader succeeded but produced nothing usable.
        logging.warning(f"No data found in PDF at {pdf_url}")
    except Exception as exc:
        # Best-effort contract: report the problem and fall through to "".
        logging.error(f"Failed to process PDF at {pdf_url}: {str(exc)}")
    return ""
def split_text(text: str) -> List[str]:
    """Split text into chunks using the module-level ``text_splitter``.

    Args:
        text: The text to split. May be empty, which is what
            ``process_pdf`` returns when it cannot extract anything.

    Returns:
        The list of chunks produced by ``text_splitter.split_text``; an
        empty list when ``text`` is empty.
    """
    # Nothing to split: skip the splitter for empty input (the value
    # process_pdf returns on failure).
    if not text:
        return []
    return text_splitter.split_text(text)