usmanyousaf committed on
Commit
59e7319
1 Parent(s): de916e4

Update scrape.py

Browse files
Files changed (1) hide show
  1. scrape.py +14 -11
scrape.py CHANGED
@@ -1,23 +1,23 @@
1
  from selenium import webdriver
2
- from selenium.webdriver.chrome.service import Service
3
- from selenium.webdriver.chrome.options import Options
 
4
  from bs4 import BeautifulSoup
5
- from dotenv import load_dotenv
6
- import os
7
  import time
8
 
9
- load_dotenv()
10
 
11
- CHROME_DRIVER_PATH = os.getenv("./chromedriver")
 
 
 
 
 
 
12
 
13
  def scrape_website(website):
14
  print("Connecting to Chrome Browser...")
15
 
16
- # Setup ChromeDriver service and options
17
- service = Service(CHROME_DRIVER_PATH)
18
- options = Options()
19
- driver = webdriver.Chrome(service=service, options=options)
20
-
21
  try:
22
  driver.get(website)
23
  print("Waiting for CAPTCHA to be solved manually (if present)...")
@@ -44,9 +44,11 @@ def extract_body_content(html_content):
44
  def clean_body_content(body_content):
45
  soup = BeautifulSoup(body_content, "html.parser")
46
 
 
47
  for script_or_style in soup(["script", "style"]):
48
  script_or_style.extract()
49
 
 
50
  cleaned_content = soup.get_text(separator="\n")
51
  cleaned_content = "\n".join(
52
  line.strip() for line in cleaned_content.splitlines() if line.strip()
@@ -55,6 +57,7 @@ def clean_body_content(body_content):
55
  return cleaned_content
56
 
57
  def split_dom_content(dom_content, max_length=6000):
 
58
  return [
59
  dom_content[i : i + max_length] for i in range(0, len(dom_content), max_length)
60
  ]
 
1
  from selenium import webdriver
2
+ from webdriver_manager.chrome import ChromeDriverManager
3
+ from selenium.webdriver.chrome.service import Service
4
+ from selenium.webdriver.chrome.options import Options
5
  from bs4 import BeautifulSoup
 
 
6
  import time
7
 
8
+ # No explicit CHROME_DRIVER_PATH or .env lookup is needed; WebDriverManager handles it.
9
 
10
# Chrome command-line switches applied to the browser session created below.
# NOTE(review): "--headless" presumably hides the browser window, which looks
# at odds with the "solved manually" CAPTCHA message printed by
# scrape_website -- confirm this is intended.
_CHROME_FLAGS = ("--headless", "--no-sandbox", "--disable-dev-shm-usage")

options = Options()
for _chrome_flag in _CHROME_FLAGS:
    options.add_argument(_chrome_flag)

# ChromeDriverManager().install() supplies the driver path handed to Service,
# so no hard-coded chromedriver location is required. The browser is started
# here, at module level, and exposed through the module-level `driver`.
driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=options,
)
17
 
18
  def scrape_website(website):
19
  print("Connecting to Chrome Browser...")
20
 
 
 
 
 
 
21
  try:
22
  driver.get(website)
23
  print("Waiting for CAPTCHA to be solved manually (if present)...")
 
44
  def clean_body_content(body_content):
45
  soup = BeautifulSoup(body_content, "html.parser")
46
 
47
+ # Remove all <script> and <style> elements
48
  for script_or_style in soup(["script", "style"]):
49
  script_or_style.extract()
50
 
51
+ # Extract and clean text
52
  cleaned_content = soup.get_text(separator="\n")
53
  cleaned_content = "\n".join(
54
  line.strip() for line in cleaned_content.splitlines() if line.strip()
 
57
  return cleaned_content
58
 
59
  def split_dom_content(dom_content, max_length=6000):
60
+ # Split the content into chunks of max_length characters
61
  return [
62
  dom_content[i : i + max_length] for i in range(0, len(dom_content), max_length)
63
  ]