KingNish committed on
Commit
0fe027e
1 Parent(s): c36d9d7

Update chatbot.py

Browse files
Files changed (1) hide show
  1. chatbot.py +12 -31
chatbot.py CHANGED
@@ -23,6 +23,7 @@ from transformers import AutoProcessor
23
  from huggingface_hub import InferenceClient
24
  from PIL import Image
25
  import spaces
 
26
 
27
  # Set device to CUDA if available, otherwise CPU
28
  DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -222,26 +223,8 @@ def extract_images_from_msg_list(msg_list):
222
  all_images.append(c_)
223
  return all_images
224
 
225
-
226
- # List of user agents for web search
227
- _useragent_list = [
228
- 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0',
229
- 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
230
- 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
231
- 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
232
- 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
233
- 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62',
234
- 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0'
235
- ]
236
-
237
-
238
- # Get a random user agent from the list
239
- def get_useragent():
240
- """Returns a random user agent from the list."""
241
- return random.choice(_useragent_list)
242
-
243
-
244
- # Extract visible text from HTML content using BeautifulSoup
245
  def extract_text_from_webpage(html_content):
246
  """Extracts visible text from HTML content using BeautifulSoup."""
247
  soup = BeautifulSoup(html_content, "html.parser")
@@ -252,24 +235,23 @@ def extract_text_from_webpage(html_content):
252
  visible_text = soup.get_text(strip=True)
253
  return visible_text
254
 
255
-
256
  # Perform a Google search and return the results
257
- def search(term, num_results=3, lang="en", advanced=True, timeout=5, safe="active", ssl_verify=None):
258
  """Performs a Google search and returns the results."""
259
- escaped_term = urllib.parse.quote_plus(term)
260
  start = 0
261
  all_results = []
262
  # Limit the number of characters from each webpage to stay under the token limit
263
  max_chars_per_page = 8000 # Adjust this value based on your token limit and average webpage length
264
-
265
- with requests.Session() as session:
266
  while start < num_results:
267
- resp = session.get(
268
  url="https://www.google.com/search",
269
- headers={"User-Agent": get_useragent()},
270
  params={
271
  "q": term,
272
- "num": num_results - start,
273
  "hl": lang,
274
  "start": start,
275
  "safe": safe,
@@ -288,7 +270,7 @@ def search(term, num_results=3, lang="en", advanced=True, timeout=5, safe="activ
288
  if link:
289
  link = link["href"]
290
  try:
291
- webpage = session.get(link, headers={"User-Agent": get_useragent()})
292
  webpage.raise_for_status()
293
  visible_text = extract_text_from_webpage(webpage.text)
294
  # Truncate text if it's too long
@@ -300,10 +282,9 @@ def search(term, num_results=3, lang="en", advanced=True, timeout=5, safe="activ
300
  all_results.append({"link": link, "text": None})
301
  else:
302
  all_results.append({"link": None, "text": None})
303
- start += len(result_block)
304
  return all_results
305
 
306
-
307
  # Format the prompt for the language model
308
  def format_prompt(user_prompt, chat_history):
309
  prompt = "<s>"
 
23
  from huggingface_hub import InferenceClient
24
  from PIL import Image
25
  import spaces
26
+ from functools import lru_cache
27
 
28
  # Set device to CUDA if available, otherwise CPU
29
  DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
223
  all_images.append(c_)
224
  return all_images
225
 
226
+ # Perform a Google search and return the results
227
+ @lru_cache(maxsize=128)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
228
  def extract_text_from_webpage(html_content):
229
  """Extracts visible text from HTML content using BeautifulSoup."""
230
  soup = BeautifulSoup(html_content, "html.parser")
 
235
  visible_text = soup.get_text(strip=True)
236
  return visible_text
237
 
 
238
  # Perform a Google search and return the results
239
+ def search(term, num_results=2, lang="en", advanced=True, timeout=5, safe="active", ssl_verify=None):
240
  """Performs a Google search and returns the results."""
241
+ escaped_term = urllib.parse.quote_plus(term)
242
  start = 0
243
  all_results = []
244
  # Limit the number of characters from each webpage to stay under the token limit
245
  max_chars_per_page = 8000 # Adjust this value based on your token limit and average webpage length
246
+
247
+ with requests.Session() as session:
248
  while start < num_results:
249
+ resp = session.get(
250
  url="https://www.google.com/search",
251
+ headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0"},
252
  params={
253
  "q": term,
254
+ "num": num_results - start,
255
  "hl": lang,
256
  "start": start,
257
  "safe": safe,
 
270
  if link:
271
  link = link["href"]
272
  try:
273
+ webpage = session.get(link, headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0"})
274
  webpage.raise_for_status()
275
  visible_text = extract_text_from_webpage(webpage.text)
276
  # Truncate text if it's too long
 
282
  all_results.append({"link": link, "text": None})
283
  else:
284
  all_results.append({"link": None, "text": None})
285
+ start += len(result_block)
286
  return all_results
287
 
 
288
  # Format the prompt for the language model
289
  def format_prompt(user_prompt, chat_history):
290
  prompt = "<s>"