File size: 6,453 Bytes
14e17ef
 
 
 
 
 
 
 
ee79db1
14e17ef
 
 
 
 
 
f94a42e
 
ee79db1
 
f94a42e
14e17ef
 
f94a42e
14e17ef
 
f94a42e
 
 
 
 
 
14e17ef
 
f94a42e
14e17ef
 
f94a42e
14e17ef
 
f94a42e
 
 
 
 
 
 
14e17ef
f94a42e
14e17ef
 
f94a42e
14e17ef
 
f94a42e
 
 
 
 
 
 
14e17ef
f94a42e
14e17ef
 
f94a42e
14e17ef
 
f94a42e
 
 
 
 
 
 
 
 
 
14e17ef
 
 
 
f94a42e
14e17ef
 
f94a42e
14e17ef
f94a42e
 
 
 
 
 
 
 
 
 
 
14e17ef
 
 
 
 
f94a42e
14e17ef
 
 
f94a42e
14e17ef
 
f94a42e
14e17ef
f94a42e
 
 
 
 
 
 
 
 
 
 
14e17ef
 
 
 
 
f94a42e
 
 
 
14e17ef
 
f94a42e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14e17ef
 
 
 
 
 
 
 
 
 
 
 
 
f94a42e
ee79db1
f94a42e
 
 
 
 
 
 
 
 
 
 
ee79db1
 
 
 
 
 
 
 
 
 
f94a42e
ee79db1
 
f94a42e
14e17ef
f94a42e
14e17ef
f94a42e
14e17ef
 
 
 
 
f94a42e
 
14e17ef
 
ee79db1
f94a42e
 
 
 
 
14e17ef
 
f94a42e
ee79db1
14e17ef
f94a42e
14e17ef
ee79db1
 
f94a42e
14e17ef
f94a42e
14e17ef
ee79db1
 
 
14e17ef
f94a42e
 
 
 
 
 
14e17ef
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
import streamlit as st
from transformers import pipeline
import pickle
import os
import pandas as pd
import ast
import string
import re
from sentence_transformers import SentenceTransformer, util

st.set_page_config(
	page_title="Offer Recommender",
	layout="wide"
)


@st.cache_resource
def _load_zero_shot_pipeline():
	'''
	Build the zero-shot classification pipeline once per server process

	Returns:
		(transformers.Pipeline) : zero-shot classifier backed by distilbart-mnli
	'''
	return pipeline(task="zero-shot-classification", model="valhalla/distilbart-mnli-12-3")


@st.cache_resource
def _load_sentence_encoder():
	'''
	Build the sentence embedding model once per server process

	Returns:
		(SentenceTransformer) : MiniLM sentence encoder
	'''
	return SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')


# Download and cache models
# Streamlit re-runs this script on every interaction; st.cache_resource keeps
# the (expensive to construct) models alive across reruns instead of rebuilding them.
pipe = _load_zero_shot_pipeline()
model = _load_sentence_encoder()

# Directory of csv files
dire = "DS_NLP_search_data"

# Use Streamlit caching to load data once
@st.cache_data
def get_processed_offers():
	'''
	Read the processed offers csv from the data directory and cache the result

	Every cell of the CATEGORY column is passed through ast.literal_eval, so the
	column holds parsed Python literals rather than their string form.

	Returns:
		processed_offers (pd.DataFrame) :  zero-shot categorized offers
	'''
	csv_path = os.path.join(dire, "processed_offers.csv")
	frame = pd.read_csv(csv_path)

	parsed_categories = frame["CATEGORY"].map(ast.literal_eval)
	frame["CATEGORY"] = parsed_categories

	return frame


@st.cache_data
def get_categories_data():
	'''
	Read categories.csv from the data directory and cache the result

	Returns:
		cats (pd.DataFrame) :  raw category data, exactly as read from disk
	'''
	csv_path = os.path.join(dire, "categories.csv")
	return pd.read_csv(csv_path)


@st.cache_data
def get_offers_data():
	'''
	Read offer_retailer.csv from the data directory and cache the result

	Returns:
		offers (pd.DataFrame) :  raw offers data, exactly as read from disk
	'''
	csv_path = os.path.join(dire, "offer_retailer.csv")
	return pd.read_csv(csv_path)


@st.cache_data
def get_categories(cats_):
	'''
	Collect the distinct parent categories and cache the result

	The values come from the IS_CHILD_CATEGORY_TO column, in order of first
	appearance, with the "Mature" entry left out.

	Parameters:
		cats_ (pd.DataFrame) : raw categories data

	Returns:
		categories (List) :  distinct IS_CHILD_CATEGORY_TO values without "Mature"
	'''
	excluded = ("Mature",)
	distinct_parents = cats_["IS_CHILD_CATEGORY_TO"].unique()

	return [parent for parent in distinct_parents if parent not in excluded]


# Translation table that deletes every ASCII punctuation character
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
# Matches anything that is not an ASCII letter, digit, space, newline or period
_DISALLOWED_CHARS = re.compile(r'[^a-zA-Z0-9 \n.]')


def _offer_tokens(text):
	'''
	Lowercase the text, strip punctuation and other disallowed characters,
	and split what is left on whitespace

	Parameters:
		text (string) : raw offer or query text

	Returns:
		tokens (List) : normalized words, with no empty entries
	'''
	cleaned = text.lower().translate(_PUNCTUATION_TABLE)
	return _DISALLOWED_CHARS.sub('', cleaned).split()


def check_in_offer(search_str, offer_rets):
	'''
	Determine if the input text is directly in the offer with basic string matching

	The query and every offer are normalized the same way (lowercased,
	punctuation removed), and an offer matches when the query words appear in it
	as a run of consecutive whole words. A single-word query therefore matches
	whole words only ("target" does not match "targeted").

	Parameters:
		search_str (string) : user text input
		offer_rets (pd.DataFrame) : raw offer data with an "OFFER" column

	Returns:
		df (pd.DataFrame) :  offers with text input (single "OFFER" column,
			original offer strings, in their original order)
	'''

	query_tokens = _offer_tokens(search_str)
	if not query_tokens:
		# Nothing searchable in the query: no offer can be a direct match
		return pd.DataFrame({"OFFER": []})

	span = len(query_tokens)
	offers = []
	for offer_str in offer_rets["OFFER"]:
		if not isinstance(offer_str, str):
			# Missing values (NaN / None) have no text to match against
			continue

		tokens = _offer_tokens(offer_str)
		# Slide a window of the query's length across the offer's words
		if any(tokens[start:start + span] == query_tokens
				for start in range(len(tokens) - span + 1)):
			offers.append(offer_str)

	return pd.DataFrame({"OFFER": offers})


def is_retailer(search_str, threshold=0.5):
	'''
	Decide whether the text input is most likely the name of a retailer

	The input is classified zero-shot against the labels "brand", "retailer"
	and "item"; it counts as a retailer only when "retailer" is the top label
	and its score is strictly above the threshold.

	Parameters:
		search_str (string) : user text input
		threshold (float) : minimum score the top label must exceed

	Returns:
		is_ret (boolean) :  true if retailer, false otherwise
	'''

	query = search_str.lower().capitalize()
	result = pipe(query, candidate_labels=["brand", "retailer", "item"])

	# The pipeline returns labels ordered by score, best first
	if result["labels"][0] != "retailer":
		return False

	return result["scores"][0] > threshold


def perform_cat_inference(search_str, categories, cats, processed_offers):
	'''
	Run zero shot classification in two stages and return the offers that fall
	under the best-scoring child categories

	Stage one scores the input against the parent categories. Stage two scores
	it against the child categories (PRODUCT_CATEGORY) belonging to the top
	three parents. An offer is kept when its CATEGORY values share at least one
	entry with the top three child categories.

	Parameters:
		search_str (string) : user text input
		categories (List) : parent category labels
		cats (pd.DataFrame) : raw category data
		processed_offers (pd.DataFrame) : processed_offer_data

	Returns:
		offers (pd.DataFrame) : relevant offers ("OFFER" column plus the
			original row index as an "index" column)
		labels (dict) : parent categories and their probability scores
		labels_2 (dict) : child categories and their probability scores
	'''

	# Stage 1: parent categories
	parent_result = pipe(search_str, candidate_labels=categories)
	best_parents = parent_result["labels"][:3]

	# Stage 2: child categories of the three best parents
	under_best_parents = cats["IS_CHILD_CATEGORY_TO"].isin(best_parents)
	child_candidates = list(cats[under_best_parents]["PRODUCT_CATEGORY"].unique())
	child_result = pipe(search_str, candidate_labels=child_candidates)
	best_children = set(child_result["labels"][:3])

	# Keep offers tagged with at least one of the best child categories
	has_best_child = processed_offers["CATEGORY"].apply(
		lambda offer_cats: bool(set(offer_cats) & best_children)
	)
	matching_offers = processed_offers[has_best_child]["OFFER"].reset_index()

	return matching_offers, parent_result, child_result


def sort_by_similarity(search_str, related_offers, top_k=20):
	'''
	Use sentence embeddings to evaluate the similarity of relevant offers to the text input

	All offers are embedded in a single batched encode call rather than one
	call per offer, then compared with the query embedding by cosine similarity.

	Parameters:
		search_str (string) : user text input
		related_offers (pd.DataFrame) : relevant offers discovered by zero shot learning
		top_k (int) : maximum number of offers to return (None returns all)

	Returns:
		df (pd.DataFrame) : relevant offers and their similarity scores, best
			first, with columns "OFFER" and "scores"
	'''

	# Drop duplicate offers but keep first-seen order, so ties stay in input
	# order after the (stable) sort below
	unique_offers = list(dict.fromkeys(related_offers["OFFER"]))
	if not unique_offers:
		# Nothing to rank; skip the model entirely
		return pd.DataFrame({"OFFER": [], "scores": []})

	query_embedding = model.encode(search_str, convert_to_tensor=True)
	offer_embeddings = model.encode(unique_offers, convert_to_tensor=True)

	# Result is a 1 x N matrix: one row for the query, one column per offer
	similarities = util.pytorch_cos_sim(query_embedding, offer_embeddings)[0].tolist()

	ranked = sorted(
		zip(unique_offers, similarities),
		key=lambda pair: pair[1],
		reverse=True,
	)[:top_k]

	return pd.DataFrame({
		"OFFER": [offer for offer, _ in ranked],
		"scores": [score for _, score in ranked],
	})


def main():
	'''
	Render the search page: a text box and Search button in the first column,
	results in the second column

	On Search, offers containing the query text are listed first. The remaining
	offers are then ranked by embedding similarity: against every offer when
	the query looks like a retailer, otherwise only against offers in the
	categories picked by zero shot classification.
	'''
	col_1, col_2, _ = st.columns(3)
	# capitalize() upper-cases the first character and lower-cases the rest
	search_str = col_1.text_input("Enter a retailer, brand, or category").capitalize()

	# Load and cache data
	processed_offers = get_processed_offers()
	cats = get_categories_data()
	offer_rets = get_offers_data()
	categories = get_categories(cats)

	if not col_1.button("Search", type="primary"):
		return

	if not search_str.strip():
		# An empty query has nothing to classify; the zero-shot pipeline
		# raises on an empty sequence, so stop before any model call
		col_1.warning("Please enter a search term")
		return

	# Check offers where the text is directly in it
	retail = is_retailer(search_str)
	direct_offers = check_in_offer(search_str, offer_rets)
	col_2.write("Directly related offers")

	if direct_offers.empty:
		col_2.write("None found")
	else:
		col_2.table(direct_offers)

	already_shown = list(direct_offers["OFFER"])

	if retail:
		# If retail, we directly compare every offer using sentence embeddings
		related_offers = offer_rets[~offer_rets["OFFER"].isin(already_shown)]
	else:
		# Otherwise, we use zero shot learning with processed offers to narrow down our search
		related_offers, labels_1, labels_2 = perform_cat_inference(search_str, categories, cats, processed_offers)
		related_offers = related_offers[~related_offers["OFFER"].isin(already_shown)]

		col_2.write("Parent categories probabilities")
		col_2.table(pd.DataFrame({"labels": labels_1["labels"][:5], "scores": labels_1["scores"][:5]}))
		col_2.write("Child categories probabilities")
		col_2.table(pd.DataFrame({"labels": labels_2["labels"][:5], "scores": labels_2["scores"][:5]}))

	col_2.write("Other related offers")
	sorted_offers = sort_by_similarity(search_str, related_offers)

	if sorted_offers.empty:
		col_2.write("None found")
	else:
		col_2.table(sorted_offers)

if __name__ == "__main__":
	main()