################
# Load a Markdown document, split it into chunks, embed the chunks into a
# vector store, and answer queries by similarity search.
################
import tiktoken

# cl100k_base is the tokenizer used by OpenAI's GPT-3.5/4 family; chunk sizes
# below are therefore measured in model tokens, not characters.
tokenizer = tiktoken.get_encoding('cl100k_base')


def tiktoken_len(text):
    """Return the number of cl100k_base tokens in *text*.

    Used as the ``length_function`` of the text splitter so that
    ``chunk_size`` / ``chunk_overlap`` are expressed in tokens.
    """
    return len(tokenizer.encode(text))


# NOTE(review): SentenceTransformerEmbeddings is imported but never used below
# (HuggingFaceEmbeddings is used instead) — kept for backward compatibility,
# consider removing. Path updated from the deprecated
# ``langchain.embeddings.sentence_transformer`` location (removed in LangChain 0.2).
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
# ``langchain.document_loaders`` / ``langchain.vectorstores`` were moved to
# ``langchain_community`` in LangChain 0.2 (which this file already targets).
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain_community.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores.utils import filter_complex_metadata

## Load the Markdown file and split it into chunks.
# https://python.langchain.com/v0.2/docs/how_to/document_loader_markdown/
# mode="elements" yields one Document per structural element (title, list, ...).
loader = UnstructuredMarkdownLoader('Document/Knowledge.md', mode="elements")
pages = loader.load_and_split()

# Re-split the elements into ~500-token chunks with an 80-token overlap.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=80,
    length_function=tiktoken_len,
)
sourceDocs = text_splitter.split_documents(pages)
# Chroma only accepts str/int/float/bool metadata values; drop the rest.
sourceDocs = filter_complex_metadata(sourceDocs)

################
# Embed the documents with a HuggingFace model and search by similarity.
################

# Korean sentence-embedding model; normalized embeddings so cosine similarity
# reduces to a dot product.
model_huggingface = HuggingFaceEmbeddings(
    model_name='jhgan/ko-sroberta-multitask',
    model_kwargs={'device': 'cpu'},
    encode_kwargs={'normalize_embeddings': True},
)

## Build the Chroma vector store from the chunked documents.
db = Chroma.from_documents(sourceDocs, model_huggingface)


def SearchDocs(question, k=4):
    """Return the *k* most relevant chunks for *question* as one string.

    Each chunk is prefixed with a blank line ("\\n\\n"), matching the
    historical output format of this function.
    """
    results = db.similarity_search_with_relevance_scores(question, k=k)
    # Each result is a (Document, relevance_score) pair; keep only the text.
    # str.join avoids the quadratic cost of repeated string concatenation.
    return ''.join('\n\n' + doc.page_content for doc, _score in results)


# # Query smoke test
# question = "자연어 처리란 무엇인가요?"
# print(SearchDocs(question, k=1))