import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, WebDriverException
from selenium.webdriver import FirefoxOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait


def load_driver() -> webdriver.Firefox:
    """Start a headless Firefox WebDriver and return it.

    The caller owns the returned driver and is responsible for calling
    ``driver.quit()`` when finished with it.
    """
    print("Loading driver...")
    opts = FirefoxOptions()
    opts.add_argument("--headless")
    return webdriver.Firefox(options=opts)


def parse_review(html: str) -> dict:
    """Extract the text fields of a single review block.

    Args:
        html: Outer HTML of one review element.

    Returns:
        A dict with the keys ``"positive"``, ``"negative"``, ``"room"`` and
        ``"time"``. A value is ``None`` when the corresponding element is
        not present in ``html``.

    Raises:
        ValueError: If a review row lacks its label or body span, or if the
            label is neither "Понравилось" (Russian for "Liked") nor
            "Не понравилось" (Russian for "Disliked").
    """
    soup = BeautifulSoup(html, 'html.parser')

    # Review text
    positive_review = None
    negative_review = None
    for row in soup.find_all("div", class_="c-review__row"):
        # Rows containing a translation-loader span are skipped (presumably
        # their text is still being translated and is not final yet).
        if row.find("span", class_="c-review__translation-loader") is not None:
            continue

        label = row.find("span", class_="bui-u-sr-only")
        body = row.find("span", class_='c-review__body')
        if label is None or body is None:
            raise ValueError("Review row is missing its label or body span")

        delimiter = label.text.strip()
        review_text = body.text.strip()
        if delimiter == "Понравилось":
            positive_review = review_text
        elif delimiter == "Не понравилось":
            negative_review = review_text
        else:
            raise ValueError(f"Unexpected review row label: {delimiter!r}")

    # Room name
    room_info = soup.find('div', class_='c-review-block__room-info-row')
    room_body = (
        room_info.find('div', class_='bui-list__body')
        if room_info is not None
        else None
    )
    room_name = room_body.get_text(strip=True) if room_body is not None else None

    # NOTE: the review's own timestamp (span.c-review-block__date) is
    # currently not extracted.

    # Number of nights + date
    stay_date_info = soup.find('ul', class_='c-review-block__stay-date')
    if stay_date_info is not None:
        date_info = stay_date_info.get_text(strip=True).replace(" ·", ", ")
    else:
        date_info = None

    return {
        "positive": positive_review,
        "negative": negative_review,
        "room": room_name,
        "time": date_info,
    }


def scrape_page(driver: webdriver.Firefox, url: str, page_count: int = 5, wait_time: int = 1) -> list:
    """Collect reviews from up to ``page_count`` pages of a review listing.

    Example URL:
    ``https://www.booking.com/hotel/th/queen-boutique.ru.html#tab-reviews``

    Args:
        driver: WebDriver used to load and navigate the page.
        url: Address of the page whose reviews are scraped.
        page_count: Maximum number of review pages to read.
        wait_time: Seconds to sleep after the review list is detected on
            each page, before reading it.

    Returns:
        A list of dicts as produced by ``parse_review``, in page order.
        Scraping stops early, returning what was collected so far, when no
        "next page" control is found.
    """
    review_infos = []
    driver.get(url)
    print("page loaded")

    for i in range(page_count):
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "review_list_new_item_block"))
        )
        time.sleep(wait_time)

        # Remove cookie banner. This is best-effort: the script fails when
        # the banner is absent, and that must not abort the scrape. Only
        # WebDriver errors are suppressed so that KeyboardInterrupt and
        # SystemExit still propagate.
        try:
            driver.execute_script("return document.getElementById('onetrust-banner-sdk').remove();")
        except WebDriverException:
            pass

        elems = driver.find_elements(By.CLASS_NAME, "review_list_new_item_block")
        review_infos.extend(
            parse_review(elem.get_attribute('outerHTML')) for elem in elems
        )
        print(f"Done page {i+1} of {page_count}")

        # Nothing more to read after the last requested page, so do not
        # navigate any further.
        if i + 1 >= page_count:
            break

        # No "next" control: return what has been collected instead of
        # raising and discarding it.
        try:
            pagenext = driver.find_element(By.CLASS_NAME, "pagenext")
        except NoSuchElementException:
            break
        pagenext.click()

    return review_infos