{user_name} committed on
Commit
b16918e
•
1 Parent(s): d0c988b

Update space

Files changed (1)
  1. Module/rag.py +67 -0
Module/rag.py ADDED
@@ -0,0 +1,67 @@
+ ################
+ # Load a PDF file, split the document, vectorize it, then run queries against it
+ ################
+ import tiktoken
+ tokenizer = tiktoken.get_encoding('cl100k_base')
+ def tiktoken_len(text):
+     tokens = tokenizer.encode(text)
+     return len(tokens)
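+ # tiktoken_len is passed to the splitter below as length_function,
+ # so chunk_size and chunk_overlap are measured in tokens rather than characters.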
+
+ from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain.vectorstores import Chroma
+ from langchain.document_loaders import PyPDFLoader
+ from langchain.embeddings import HuggingFaceEmbeddings
+
+ ## Load the PDF and split it
+ loader = PyPDFLoader('https://wdr.ubion.co.kr/wowpass/img/event/gsat_170823/gsat_170823.pdf')
+ pages = loader.load_and_split()
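+ # load_and_split() loads the PDF page by page and pre-splits it with LangChain's default text splitter.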
+
+ ## Split into chunks
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=80, length_function=tiktoken_len)
+ sourceDocs = text_splitter.split_documents(pages)
+
+ ################
+ # Vectorize the documents with a HuggingFace model, then run a similarity search
+ ################
+ from langchain.embeddings import HuggingFaceEmbeddings
+
+ model_huggingface = HuggingFaceEmbeddings(model_name='jhgan/ko-sroberta-multitask',
+                                           model_kwargs={'device': 'cpu'},
+                                           encode_kwargs={'normalize_embeddings': True})
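+ # normalize_embeddings=True yields unit-length vectors, so cosine similarity and inner product rank results identically.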
+
+ ## Build a Chroma store from the PDF chunks (vectorize the docs)
+ db = Chroma.from_documents(sourceDocs, model_huggingface)
+
+ ## Ask a question
+ question = '삼성전자의 주요 사업영역은?'  # "What are Samsung Electronics' main business areas?"
+ docs3 = db.similarity_search_with_relevance_scores(question, k=1)  # Is k=2 just too slow? No answer was coming back...
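+ # similarity_search_with_relevance_scores returns (Document, score) pairs with scores scaled to [0, 1],
+ # which is why doc[0].page_content is used below.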
+
+ # Persist the store to disk and load it back
+ # db_toFiles = Chroma.from_documents(sourceDocs, model_huggingface, persist_directory='./samsumg.db')
+ # db_fromfile = Chroma(persist_directory='./samsumg.db', embedding_function=model_huggingface)
+ # docs3 = db_fromfile.similarity_search_with_relevance_scores(question, k=3)
+
+ joinDoc = ' '.join([doc[0].page_content for doc in docs3])
+ print(joinDoc)
+
+ ################
+ # Pass the retrieved documents to the prompt and generate an answer with the LLM
+ ################
+ from langchain_community.chat_models import ChatOllama
+ llm = ChatOllama(
+     base_url='http://localhost:11434',
+     # model="phi3:medium",  # too slow, switched to mini
+     model="phi3:mini",
+ )
+
+ from langchain_core.prompts import ChatPromptTemplate
+
+ prompt = ChatPromptTemplate.from_messages([
+     ("system", "Please answer the following question from the document: {document}"),
+     ("user", "{question}"),
+ ])
+
+ print('-'*50)
+ chain = prompt | llm
+ print(chain.invoke({"question": question, "document": joinDoc}))
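+ # chain.invoke() returns an AIMessage; its .content attribute holds the generated answer
+ # (append StrOutputParser from langchain_core.output_parsers to the chain to get a plain string instead).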