Spaces:
Paused
Paused
File size: 885 Bytes
84deff7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 |
# pdf_processor.py
import logging
from typing import List

from langchain_community.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Root logger setup, done once at import time: timestamp - level - message,
# with INFO as the minimum level.
_LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)

# Module-wide splitter shared by split_text(); chunk_size and chunk_overlap
# are forwarded to RecursiveCharacterTextSplitter unchanged.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)
def process_pdf(pdf_url: str) -> str:
    """Load a PDF and return the text of all its pages as a single string.

    Args:
        pdf_url: Location of the PDF; passed unchanged to ``PyMuPDFLoader``.

    Returns:
        The ``page_content`` of every loaded page joined with newlines, or an
        empty string when the loader returns no pages or any error occurs.
        Failures are logged and never propagated to the caller.
    """
    logging.info("Processing PDF from URL: %s", pdf_url)
    try:
        pages = PyMuPDFLoader(pdf_url).load()
        if not pages:
            logging.warning("No data found in PDF at %s", pdf_url)
            return ""
        return "\n".join(page.page_content for page in pages)
    except Exception as exc:
        # Deliberate best-effort boundary: callers receive "" instead of an
        # exception. logging.exception keeps the same message line as before
        # and additionally records the traceback so failures can be diagnosed.
        logging.exception("Failed to process PDF at %s: %s", pdf_url, exc)
        return ""
def split_text(text: str) -> List[str]:
    """Split *text* into chunks using the module-level ``text_splitter``.

    Args:
        text: The text to split, e.g. the string returned by ``process_pdf``.

    Returns:
        The chunks returned by ``text_splitter.split_text``. An empty list is
        returned for empty input without invoking the splitter.
    """
    if not text:
        # process_pdf() returns "" on failure or for an empty document;
        # there is nothing to split in that case.
        return []
    return text_splitter.split_text(text)
|