KingNish committed on
Commit
cb2edc5
1 Parent(s): 0e44db5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +9 -23
app.py CHANGED
@@ -177,13 +177,7 @@ async def chat(
177
 
178
  def extract_text_from_webpage(html_content):
179
  """Extracts visible text from HTML content using BeautifulSoup."""
180
- soup = BeautifulSoup(html_content, "html.parser")
181
- # Remove unwanted tags
182
- for tag in soup(["script", "style", "header", "footer", "nav"]):
183
- tag.extract()
184
- # Get the remaining visible text
185
- visible_text = soup.get_text(strip=True)
186
- return visible_text
187
 
188
  async def fetch_and_extract(url, max_chars, proxy: Optional[str] = None):
189
  """Fetches a URL and extracts text asynchronously."""
@@ -248,19 +242,12 @@ async def web_search_and_extract(
248
 
249
  def extract_text_from_webpage2(html_content):
250
  """Extracts visible text from HTML content using BeautifulSoup."""
251
- soup = BeautifulSoup(html_content, "html.parser")
252
- # Remove unwanted tags
253
- for tag in soup(["script", "style", "header", "footer", "nav"]):
254
- tag.extract()
255
- # Get the remaining visible text
256
- visible_text = soup.get_text(strip=True)
257
- return visible_text
258
-
259
- def fetch_and_extract2(url, max_chars, proxy: Optional[str] = None):
260
  """Fetches a URL and extracts text using threading."""
261
- proxies = {'http': proxy, 'https': proxy} if proxy else None
262
  try:
263
- response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, proxies=proxies)
264
  response.raise_for_status()
265
  html_content = response.text
266
  visible_text = extract_text_from_webpage2(html_content)
@@ -274,14 +261,13 @@ def fetch_and_extract2(url, max_chars, proxy: Optional[str] = None):
274
  @app.get("/api/websearch-and-extract-threading")
275
  def web_search_and_extract_threading(
276
  q: str,
277
- max_results: int = 3,
278
  timelimit: Optional[str] = None,
279
  safesearch: str = "moderate",
280
  region: str = "wt-wt",
281
  backend: str = "html",
282
- max_chars: int = 6000,
283
- extract_only: bool = True,
284
- proxy: Optional[str] = None
285
  ):
286
  """
287
  Searches using WEBS, extracts text from the top results using threading, and returns both.
@@ -297,7 +283,7 @@ def web_search_and_extract_threading(
297
  threads = []
298
  for result in search_results:
299
  if 'href' in result:
300
- thread = threading.Thread(target=lambda: extracted_results.append(fetch_and_extract2(result['href'], max_chars, proxy)))
301
  threads.append(thread)
302
  thread.start()
303
 
 
177
 
178
  def extract_text_from_webpage(html_content):
179
  """Extracts visible text from HTML content using BeautifulSoup."""
180
+ return BeautifulSoup(html_content).get_text(strip=True)
 
 
 
 
 
 
181
 
182
  async def fetch_and_extract(url, max_chars, proxy: Optional[str] = None):
183
  """Fetches a URL and extracts text asynchronously."""
 
242
 
243
  def extract_text_from_webpage2(html_content):
244
  """Extracts visible text from HTML content using BeautifulSoup."""
245
+ return BeautifulSoup(html_content).get_text(strip=True)
246
+
247
+ def fetch_and_extract2(url, max_chars):
 
 
 
 
 
 
248
  """Fetches a URL and extracts text using threading."""
 
249
  try:
250
+ response = requests.get(url, headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0"})
251
  response.raise_for_status()
252
  html_content = response.text
253
  visible_text = extract_text_from_webpage2(html_content)
 
261
  @app.get("/api/websearch-and-extract-threading")
262
  def web_search_and_extract_threading(
263
  q: str,
264
+ max_results: int = 10,
265
  timelimit: Optional[str] = None,
266
  safesearch: str = "moderate",
267
  region: str = "wt-wt",
268
  backend: str = "html",
269
+ max_chars: int = 10000,
270
+ extract_only: bool = True
 
271
  ):
272
  """
273
  Searches using WEBS, extracts text from the top results using threading, and returns both.
 
283
  threads = []
284
  for result in search_results:
285
  if 'href' in result:
286
+ thread = threading.Thread(target=lambda: extracted_results.append(fetch_and_extract2(result['href'], max_chars)))
287
  threads.append(thread)
288
  thread.start()
289