TintinMeimei committed on
Commit d938037
1 Parent(s): 2ac7b58

Upload 3 files

Files changed (3):
  1. tmp/algo.py +143 -0
  2. tmp/demo_ai_search.py +36 -0
  3. tmp/lvchan.xlsx +0 -0
tmp/algo.py ADDED
@@ -0,0 +1,143 @@
import re

import jieba
import pandas as pd
from sentence_transformers import SentenceTransformer, util


class AlgoRule:

    def __init__(self) -> None:
        # The sheet's real column names sit in its first data row; promote them.
        df_lvchan = pd.read_excel('lvchan.xlsx', sheet_name='Sheet1')
        df_lvchan.columns = df_lvchan.iloc[0]
        df_lvchan = df_lvchan[1:]
        sep = r'[,、]'
        # Inverted indexes mapping keywords to catalogue items (三级标题):
        # 'kuan' holds broad-scope keywords, 'wuxiang' holds object keywords,
        # 'wuxiang_xianding' holds object-keyword/qualifier pairs joined by '_'.
        self.dict_rule_index = {
            'kuan': {},
            'wuxiang': {},
            'wuxiang_xianding': {},
        }
        for _, row in df_lvchan.iterrows():
            item = row['三级标题']
            for word in re.split(sep, row['宽口径(复核)']):
                self.dict_rule_index['kuan'].setdefault(word, []).append(item)
            for word in re.split(sep, row['物象关键词(复核)']):
                self.dict_rule_index['wuxiang'].setdefault(word, []).append(item)
                for word2 in re.split(sep, row['限定词(复核)']):
                    self.dict_rule_index['wuxiang_xianding'].setdefault('_'.join([word, word2]), []).append(item)
        # Deduplicate the item lists.
        for k in self.dict_rule_index:
            for key in self.dict_rule_index[k]:
                self.dict_rule_index[k][key] = list(set(self.dict_rule_index[k][key]))

    def _tokenize(self, text):
        return list(jieba.cut(text))

    def _is_match(self, word, query):
        # A keyword counts as a match only when it appears as a whole token of
        # the segmented query (stricter than the earlier plain substring check).
        return word in self._tokenize(query)

    def _match(self, query):
        result = {}
        # 1st route: match an object keyword and its qualifier together.
        flag = False
        for key, items in self.dict_rule_index['wuxiang_xianding'].items():
            wuxiang, xianding = key.split('_')
            if self._is_match(wuxiang, query) and self._is_match(xianding, query):
                for item in items:
                    r = result.setdefault(item, {})
                    r.setdefault('限定词+物项关键词', []).append('+'.join([xianding, wuxiang]))
                flag = True
        if flag:
            # Pair matches take priority: clean the result and return early.
            for key1 in result:
                for key2 in result[key1]:
                    result[key1][key2] = ' ; '.join(result[key1][key2])
            return result
        # 2nd route: match an object keyword alone.
        for key, items in self.dict_rule_index['wuxiang'].items():
            if self._is_match(key, query):
                for item in items:
                    r = result.setdefault(item, {})
                    r.setdefault('物项关键词', []).append(key)
        # 3rd route: match a broad-scope keyword.
        for key, items in self.dict_rule_index['kuan'].items():
            if self._is_match(key, query):
                for item in items:
                    r = result.setdefault(item, {})
                    r.setdefault('宽口径', []).append(key)
        # Clean result: collapse each keyword list into a display string.
        for key1 in result:
            for key2 in result[key1]:
                result[key1][key2] = ' ; '.join(result[key1][key2])
        return result

    def algo(self, query):
        result = self._match(query)
        return [item.strip() for item in result.keys()]


class AlgoAI:

    def __init__(self) -> None:
        # self.model = SentenceTransformer('DMetaSoul/sbert-chinese-general-v2')
        self.model = SentenceTransformer('TintinMeimei/menglang_yongtulv_aimatch_v1')
        # Read the same spreadsheet as AlgoRule (the original used '../lvchan.xlsx';
        # a single relative path keeps both classes working from one directory).
        df_lvchan = pd.read_excel('lvchan.xlsx', sheet_name='Sheet1')
        df_lvchan.columns = df_lvchan.iloc[0]
        df_lvchan = df_lvchan[1:]
        # Embed each catalogue item once: its title joined with its explanation.
        dict_lvchan = dict((row['三级标题'].strip(), '\n'.join([row['三级标题'].strip(), row['解释说明']])) for _, row in df_lvchan.iterrows())
        self.dict_lvchan_vectors = dict((key, self.model.encode(text, convert_to_tensor=True)) for key, text in dict_lvchan.items())
        self.thres = 0.25

    def _sim(self, query, item):
        emb1 = self.model.encode(query, convert_to_tensor=True)
        return util.cos_sim(emb1, item)

    def _match(self, query):
        # Keep every catalogue item whose cosine similarity clears the threshold.
        result = []
        for key, vector in self.dict_lvchan_vectors.items():
            if self._sim(query, vector) > self.thres:
                result.append(key)
        return result

    def algo(self, query):
        return self._match(query)


if __name__ == '__main__':
    algo = AlgoRule()
    query = '无害生活垃圾'
    print(algo.algo(query))

    algo2 = AlgoAI()
    print(algo2.algo(query))
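Note: both classes assume lvchan.xlsx keeps its real column headers in the first data row and provides the columns 三级标题, 宽口径(复核), 物象关键词(复核), 限定词(复核), and 解释说明, with keywords separated by '、' or ','. A minimal stand-in sheet for local testing could be generated as below; every title and keyword here is a hypothetical placeholder, not an entry from the real catalogue.

# make_test_sheet.py: a sketch, assuming the layout implied by algo.py.
import pandas as pd

rows = [
    # Written as data so that, below read_excel's default header row,
    # algo.py can promote this row to column names via df.iloc[0].
    ['三级标题', '宽口径(复核)', '物象关键词(复核)', '限定词(复核)', '解释说明'],
    # Hypothetical placeholder entry.
    ['示例项目', '垃圾、废物', '生活垃圾、废弃物', '无害、城镇', '示例说明'],
]
pd.DataFrame(rows).to_excel('lvchan.xlsx', sheet_name='Sheet1', index=False)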
tmp/demo_ai_search.py ADDED
@@ -0,0 +1,36 @@
import gradio as gr

import algo

algo_rule = algo.AlgoRule()
algo_ai = algo.AlgoAI()


def process(query):
    # Rule-based matches, sorted for a stable display order.
    r1 = sorted(algo_rule.algo(query))
    text_r1 = ''
    for item in r1:
        text_r1 += '\n' + '- ' + item

    # Embedding-based matches.
    r2 = algo_ai.algo(query)
    text_r2 = ''
    for item in r2:
        text_r2 += '\n' + '- ' + item

    # Two sections: keyword-rule matches, then AI (embedding) matches.
    output = f'''
绿产目录匹配结果 - 关键词规则:
{text_r1}


绿产目录匹配结果 - AI匹配:
{text_r2}
'''
    return output


# Instantiate the input ("输入") and output ("绿产目录匹配", catalogue match) textboxes.
textbox_input = gr.Textbox(label="输入", placeholder="", lines=2)
textbox_output = gr.Textbox(label="绿产目录匹配", placeholder="", lines=15)

demo = gr.Interface(fn=process, inputs=textbox_input, outputs=textbox_output)
demo.launch(share=False, server_name='0.0.0.0', server_port=8001)
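The demo serves on 0.0.0.0:8001 with link sharing disabled. For a quick programmatic check once it is running, something like the sketch below should work; it uses the gradio_client package, and '/predict' is the default endpoint name Gradio assigns to a single-function Interface.

# query_demo.py: a minimal client sketch, assuming the demo runs locally.
from gradio_client import Client

client = Client('http://localhost:8001/')
# gr.Interface exposes process() under Gradio's default '/predict' endpoint.
print(client.predict('无害生活垃圾', api_name='/predict'))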
tmp/lvchan.xlsx ADDED
Binary file (79.3 kB)