KingNish committed on
Commit
cb2edc5
1 Parent(s): 0e44db5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +9 -23
app.py CHANGED
@@ -177,13 +177,7 @@ async def chat(
177
 
178
  def extract_text_from_webpage(html_content):
179
  """Extracts visible text from HTML content using BeautifulSoup."""
180
- soup = BeautifulSoup(html_content, "html.parser")
181
- # Remove unwanted tags
182
- for tag in soup(["script", "style", "header", "footer", "nav"]):
183
- tag.extract()
184
- # Get the remaining visible text
185
- visible_text = soup.get_text(strip=True)
186
- return visible_text
187
 
188
  async def fetch_and_extract(url, max_chars, proxy: Optional[str] = None):
189
  """Fetches a URL and extracts text asynchronously."""
@@ -248,19 +242,12 @@ async def web_search_and_extract(
248
 
249
  def extract_text_from_webpage2(html_content):
250
  """Extracts visible text from HTML content using BeautifulSoup."""
251
- soup = BeautifulSoup(html_content, "html.parser")
252
- # Remove unwanted tags
253
- for tag in soup(["script", "style", "header", "footer", "nav"]):
254
- tag.extract()
255
- # Get the remaining visible text
256
- visible_text = soup.get_text(strip=True)
257
- return visible_text
258
-
259
- def fetch_and_extract2(url, max_chars, proxy: Optional[str] = None):
260
  """Fetches a URL and extracts text using threading."""
261
- proxies = {'http': proxy, 'https': proxy} if proxy else None
262
  try:
263
- response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, proxies=proxies)
264
  response.raise_for_status()
265
  html_content = response.text
266
  visible_text = extract_text_from_webpage2(html_content)
@@ -274,14 +261,13 @@ def fetch_and_extract2(url, max_chars, proxy: Optional[str] = None):
274
  @app.get("/api/websearch-and-extract-threading")
275
  def web_search_and_extract_threading(
276
  q: str,
277
- max_results: int = 3,
278
  timelimit: Optional[str] = None,
279
  safesearch: str = "moderate",
280
  region: str = "wt-wt",
281
  backend: str = "html",
282
- max_chars: int = 6000,
283
- extract_only: bool = True,
284
- proxy: Optional[str] = None
285
  ):
286
  """
287
  Searches using WEBS, extracts text from the top results using threading, and returns both.
@@ -297,7 +283,7 @@ def web_search_and_extract_threading(
297
  threads = []
298
  for result in search_results:
299
  if 'href' in result:
300
- thread = threading.Thread(target=lambda: extracted_results.append(fetch_and_extract2(result['href'], max_chars, proxy)))
301
  threads.append(thread)
302
  thread.start()
303
 
 
177
 
178
  def extract_text_from_webpage(html_content):
179
  """Extracts visible text from HTML content using BeautifulSoup."""
180
+ return BeautifulSoup(html_content).get_text(strip=True)
 
 
 
 
 
 
181
 
182
  async def fetch_and_extract(url, max_chars, proxy: Optional[str] = None):
183
  """Fetches a URL and extracts text asynchronously."""
 
242
 
243
  def extract_text_from_webpage2(html_content):
244
  """Extracts visible text from HTML content using BeautifulSoup."""
245
+ return BeautifulSoup(html_content).get_text(strip=True)
246
+
247
+ def fetch_and_extract2(url, max_chars):
 
 
 
 
 
 
248
  """Fetches a URL and extracts text using threading."""
 
249
  try:
250
+ response = requests.get(url, headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0"})
251
  response.raise_for_status()
252
  html_content = response.text
253
  visible_text = extract_text_from_webpage2(html_content)
 
261
  @app.get("/api/websearch-and-extract-threading")
262
  def web_search_and_extract_threading(
263
  q: str,
264
+ max_results: int = 10,
265
  timelimit: Optional[str] = None,
266
  safesearch: str = "moderate",
267
  region: str = "wt-wt",
268
  backend: str = "html",
269
+ max_chars: int = 10000,
270
+ extract_only: bool = True
 
271
  ):
272
  """
273
  Searches using WEBS, extracts text from the top results using threading, and returns both.
 
283
  threads = []
284
  for result in search_results:
285
  if 'href' in result:
286
+ thread = threading.Thread(target=lambda: extracted_results.append(fetch_and_extract2(result['href'], max_chars)))
287
  threads.append(thread)
288
  thread.start()
289