KingNish committed
Commit b61ee91
1 Parent(s): 2b2b242

Update app.py

Files changed (1):
  1. app.py +66 -0
app.py CHANGED
@@ -8,6 +8,7 @@ import requests
 import urllib.parse
 import asyncio
 import aiohttp
+import threading
 from typing import List
 
 app = FastAPI()
@@ -213,6 +214,71 @@ async def web_search_and_extract(
     except Exception as e:
         raise HTTPException(status_code=500, detail=f"Error during search and extraction: {e}")
 
+def extract_text_from_webpage2(html_content):
+    """Extracts visible text from HTML content using BeautifulSoup."""
+    soup = BeautifulSoup(html_content, "html.parser")
+    # Remove unwanted tags
+    for tag in soup(["script", "style", "header", "footer", "nav"]):
+        tag.extract()
+    # Get the remaining visible text
+    visible_text = soup.get_text(strip=True)
+    return visible_text
+
+def fetch_and_extract2(url, max_chars):
+    """Fetches a URL and extracts its visible text (run from a worker thread)."""
+    try:
+        response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
+        response.raise_for_status()
+        html_content = response.text
+        visible_text = extract_text_from_webpage2(html_content)
+        if len(visible_text) > max_chars:
+            visible_text = visible_text[:max_chars] + "..."
+        return {"link": url, "text": visible_text}
+    except requests.exceptions.RequestException as e:
+        print(f"Error fetching or processing {url}: {e}")
+        return {"link": url, "text": None}
+
+@app.get("/api/websearch-and-extract-threading")
+def web_search_and_extract_threading(
+    q: str,
+    max_results: int = 3,
+    timelimit: Optional[str] = None,
+    safesearch: str = "moderate",
+    region: str = "wt-wt",
+    backend: str = "html",
+    max_chars: int = 6000,
+    extract_only: bool = True
+):
+    """
+    Searches using WEBS, extracts text from the top results using threading, and returns both.
+    """
+    try:
+        with WEBS() as webs:
+            # Perform WEBS search
+            search_results = webs.text(keywords=q, region=region, safesearch=safesearch,
+                                       timelimit=timelimit, backend=backend, max_results=max_results)
+
+            # Extract text from each result's link using threading
+            extracted_results = []
+            threads = []
+            for result in search_results:
+                if 'href' in result:
+                    thread = threading.Thread(target=lambda url=result['href']: extracted_results.append(fetch_and_extract2(url, max_chars)))
+                    threads.append(thread)
+                    thread.start()
+
+            # Wait for all threads to finish
+            for thread in threads:
+                thread.join()
+
+            if extract_only:
+                return JSONResponse(content=jsonable_encoder(extracted_results))
+            else:
+                return JSONResponse(content=jsonable_encoder({"search_results": search_results, "extracted_results": extracted_results}))
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Error during search and extraction: {e}")
+
+
 @app.get("/api/adv_web_search")
 async def adv_web_search(
     q: str,
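
Usage sketch (not part of the commit): a minimal client call against the new route, assuming the app is served locally, for example with uvicorn app:app on port 8000. The base URL and the sample query are illustrative assumptions; only the route name and its parameters come from the diff above.

    import requests

    # Call the new threading-based endpoint; the base URL is an assumption for local testing.
    resp = requests.get(
        "http://localhost:8000/api/websearch-and-extract-threading",
        params={"q": "fastapi threading", "max_results": 3, "extract_only": True},
    )
    resp.raise_for_status()
    # With extract_only=True the response is a list of {"link", "text"} objects;
    # "text" is None for pages that failed to fetch.
    for item in resp.json():
        print(item["link"], (item["text"] or "")[:80])

Unlike the existing async web_search_and_extract handler, the new endpoint is a plain def route that fetches pages with blocking requests calls spread across threading.Thread workers, so FastAPI runs it in its threadpool rather than on the event loop.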