KingNish committed
Commit 7c669e9
1 Parent(s): cdaa80d

Update app.py

Files changed (1):
  1. app.py +45 -47
app.py CHANGED
@@ -366,55 +366,53 @@ def extract_text_from_webpage(html_content):
     return visible_text
 
 # Perform a Google search and return the results
-def search(term, num_results=3, lang="en", advanced=True, sleep_interval=0, timeout=5, safe="active", ssl_verify=None):
+def search(term, num_results=3, lang="en", advanced=True, timeout=5, safe="active", ssl_verify=None):
     """Performs a Google search and returns the results."""
-    # Ensure term is a string before parsing
-    if isinstance(term, dict):
-        term = term.get('text', '')  # Get text from user_prompt or default to empty string
-    escaped_term = urllib.parse.quote_plus(term)
+    escaped_term = urllib.parse.quote_plus(term)
     start = 0
     all_results = []
-    # Fetch results in batches
-    while start < num_results:
-        resp = requests.get(
-            url="https://www.google.com/search",
-            headers={"User-Agent": get_useragent()},  # Set random user agent
-            params={
-                "q": term,
-                "num": num_results - start,  # Number of results to fetch in this batch
-                "hl": lang,
-                "start": start,
-                "safe": safe,
-            },
-            timeout=timeout,
-            verify=ssl_verify,
-        )
-        resp.raise_for_status()  # Raise an exception if request fails
-        soup = BeautifulSoup(resp.text, "html.parser")
-        result_block = soup.find_all("div", attrs={"class": "g"})
-        # If no results, continue to the next batch
-        if not result_block:
-            start += 1
-            continue
-        # Extract link and text from each result
-        for result in result_block:
-            link = result.find("a", href=True)
-            if link:
-                link = link["href"]
-                try:
-                    # Fetch webpage content
-                    webpage = requests.get(link, headers={"User-Agent": get_useragent()})
-                    webpage.raise_for_status()
-                    # Extract visible text from webpage
-                    visible_text = extract_text_from_webpage(webpage.text)
-                    all_results.append({"link": link, "text": visible_text})
-                except requests.exceptions.RequestException as e:
-                    # Handle errors fetching or processing webpage
-                    print(f"Error fetching or processing {link}: {e}")
-                    all_results.append({"link": link, "text": None})
-            else:
-                all_results.append({"link": None, "text": None})
-        start += len(result_block)  # Update starting index for next batch
+    # Limit the number of characters from each webpage to stay under the token limit
+    max_chars_per_page = 10000  # Adjust this value based on your token limit and average webpage length
+
+    with requests.Session() as session:
+        while start < num_results:
+            resp = session.get(
+                url="https://www.google.com/search",
+                headers={"User-Agent": get_useragent()},
+                params={
+                    "q": term,
+                    "num": num_results - start,
+                    "hl": lang,
+                    "start": start,
+                    "safe": safe,
+                },
+                timeout=timeout,
+                verify=ssl_verify,
+            )
+            resp.raise_for_status()
+            soup = BeautifulSoup(resp.text, "html.parser")
+            result_block = soup.find_all("div", attrs={"class": "g"})
+            if not result_block:
+                start += 1
+                continue
+            for result in result_block:
+                link = result.find("a", href=True)
+                if link:
+                    link = link["href"]
+                    try:
+                        webpage = session.get(link, headers={"User-Agent": get_useragent()})
+                        webpage.raise_for_status()
+                        visible_text = extract_text_from_webpage(webpage.text)
+                        # Truncate text if it's too long
+                        if len(visible_text) > max_chars_per_page:
+                            visible_text = visible_text[:max_chars_per_page] + "..."
+                        all_results.append({"link": link, "text": visible_text})
+                    except requests.exceptions.RequestException as e:
+                        print(f"Error fetching or processing {link}: {e}")
+                        all_results.append({"link": link, "text": None})
+                else:
+                    all_results.append({"link": None, "text": None})
+            start += len(result_block)
     return all_results
 
 # Format the prompt for the language model
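The substantive change in this hunk is connection reuse: each requests.get call previously opened a fresh connection, while the new code routes the search request and every follow-up page fetch through one requests.Session, sharing its connection pool. A minimal sketch of the pattern, with placeholder URLs that are not part of the commit:

import requests

# Sketch of the connection-reuse pattern adopted above; the URLs are
# placeholders, not code from this commit.
with requests.Session() as session:
    # Both requests share the session's pooled connections and headers.
    listing = session.get("https://www.google.com/search",
                          params={"q": "example"}, timeout=5)
    listing.raise_for_status()
    page = session.get("https://example.com", timeout=5)
    page.raise_for_status()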
@@ -455,7 +453,7 @@ def model_inference(
         web_results = search(user_prompt["text"])
         web2 = ' '.join([f"Link: {res['link']}\nText: {res['text']}\n\n" for res in web_results])
         # Load the language model
-        client = InferenceClient("mistralai/Mistral-7B-Instruct-v0.2")
+        client = InferenceClient("mistralai/Mistral-7B-Instruct-v0.3")
         generate_kwargs = dict(
             max_new_tokens=4000,
             do_sample=True,
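The second hunk only bumps the served model from Mistral-7B-Instruct-v0.2 to v0.3. As a hedged sketch of how such a client is typically driven with the generate_kwargs shown above (the prompt and streaming loop are illustrative assumptions, not code from this commit):

from huggingface_hub import InferenceClient

# Sketch only: the model id matches the diff; the prompt and the
# streaming loop are illustrative assumptions.
client = InferenceClient("mistralai/Mistral-7B-Instruct-v0.3")
for token in client.text_generation(
        "Summarize today's top research news.",
        max_new_tokens=4000,
        do_sample=True,
        stream=True,
):
    print(token, end="")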
 
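Finally, the 10,000-character cap in the new search() is a token-budget guard: at the common rough estimate of about 4 characters per token, each page contributes at most roughly 2,500 tokens, so three results stay well inside the model's context window. A hedged usage sketch, runnable only inside app.py's module scope where search() and its helpers are defined (the query string is illustrative):

# Illustrative call to the revised search(); assumes app.py's scope.
results = search("mistral 7b instruct v0.3 changes", num_results=3)
for res in results:
    preview = (res["text"] or "")[:120]
    print(res["link"], preview)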