Spaces:
Paused
Paused
# pdf_processor.py | |
import logging
from typing import List

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyMuPDFLoader
# Configure the root logger once at import time: INFO and above, with each
# record rendered as "<time> - <level> - <message>".
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)

# Single shared splitter instance; split_text() delegates to it.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
def process_pdf(pdf_url: str) -> str:
    """Load a PDF and return the text of all its pages as one string.

    The location is handed to ``PyMuPDFLoader``; the ``page_content`` of
    every loaded page is joined with a newline between pages.

    Args:
        pdf_url: Location of the PDF, passed unchanged to ``PyMuPDFLoader``.

    Returns:
        The joined page text, or ``""`` when the loader yields no pages or
        when any exception is raised while loading or joining.
    """
    logging.info(f"Processing PDF from URL: {pdf_url}")
    try:
        pages = PyMuPDFLoader(pdf_url).load()
        if pages:
            return "\n".join(doc.page_content for doc in pages)
        logging.warning(f"No data found in PDF at {pdf_url}")
    except Exception as exc:
        # Best-effort by design: callers receive "" rather than an exception.
        logging.error(f"Failed to process PDF at {pdf_url}: {str(exc)}")
    # Reached on both the "no pages" and the "failure" paths.
    return ""
def split_text(text: str) -> List[str]:
    """Split ``text`` into chunks with the module-level ``text_splitter``.

    Args:
        text: The text to split.

    Returns:
        The chunks returned by ``text_splitter.split_text(text)``, or an
        empty list when ``text`` is empty.
    """
    # process_pdf() returns "" when nothing could be extracted; there is
    # nothing to chunk in that case, so skip the splitter entirely.
    if not text:
        return []
    return text_splitter.split_text(text)