booking_reviews_chatgpt_summary / selenium_parser.py
max-unfinity
chatgpt
9beb013
raw
history blame contribute delete
No virus
3.02 kB
from selenium import webdriver
from selenium.webdriver import FirefoxOptions
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time
def load_driver():
    """Start a headless Firefox WebDriver and return it.

    Progress messages are printed before and after the browser is created.
    """
    print("Loading driver...")
    firefox_options = FirefoxOptions()
    firefox_options.add_argument("--headless")
    browser = webdriver.Firefox(options=firefox_options)
    print("OK.")
    return browser
def parse_review(html):
    """Extract review texts, room name and stay date from one review's HTML.

    Args:
        html: HTML markup of a single review block.

    Returns:
        A dict with the keys ``"positive"``, ``"negative"``, ``"room"`` and
        ``"time"``. Each value is the stripped text of the matching element,
        or ``None`` when that element is not present in the markup.

    Raises:
        ValueError: If a review row carries a label other than the two
            recognised ones.
    """
    soup = BeautifulSoup(html, 'html.parser')

    # Review text
    positive_review = None
    negative_review = None
    for row in soup.find_all("div", class_="c-review__row"):
        # Rows that contain a translation loader are ignored.
        if row.find("span", class_="c-review__translation-loader"):
            continue
        delimiter = row.find("span", class_="bui-u-sr-only").text.strip()
        review_text = row.find("span", class_='c-review__body').text.strip()
        # The labels are matched in Russian: "Liked" / "Disliked".
        if delimiter == "Понравилось":
            positive_review = review_text
        elif delimiter == "Не понравилось":
            negative_review = review_text
        else:
            raise ValueError(f"Unexpected review row label: {delimiter!r}")

    # Room name (optional: either the row or its body may be missing)
    room_info = soup.find('div', class_='c-review-block__room-info-row')
    room_body = (
        room_info.find('div', class_='bui-list__body')
        if room_info is not None
        else None
    )
    room_name = room_body.get_text(strip=True) if room_body is not None else None

    # Number of nights + date (optional, guarded the same way as the room)
    stay_date_info = soup.find('ul', class_='c-review-block__stay-date')
    if stay_date_info is not None:
        date_info = stay_date_info.get_text(strip=True).replace(" ·", ", ")
    else:
        date_info = None

    return {
        "positive": positive_review,
        "negative": negative_review,
        "room": room_name,
        "time": date_info
    }
def scrape_page(driver: webdriver.Firefox, url: str, page_count: int = 5, wait_time: int = 3):
    """Collect parsed reviews from up to ``page_count`` pages of a listing.

    Args:
        driver: WebDriver used to load and navigate the page.
        url: Address of the reviews page to open.
        page_count: Maximum number of review pages to read.
        wait_time: Seconds to sleep on each page after the first review
            block is present, before reading the page.

    Returns:
        A list with one ``parse_review`` result per review block found, in
        page order. Scraping stops early when no "next page" control exists.

    Raises:
        selenium.common.exceptions.TimeoutException: If no review block
            appears on a page within 10 seconds.
    """
    # Imported here so this function does not need a new module-level import.
    from selenium.common.exceptions import WebDriverException

    review_infos = []
    driver.get(url)
    for i in range(page_count):
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "review_list_new_item_block"))
        )
        time.sleep(wait_time)

        # Remove cookie banner. Best effort: the script fails when the banner
        # is absent, which is fine, but only WebDriver errors are swallowed
        # so that KeyboardInterrupt / SystemExit still propagate.
        try:
            driver.execute_script("return document.getElementById('onetrust-banner-sdk').remove();")
        except WebDriverException:
            pass

        for elem in driver.find_elements(By.CLASS_NAME, "review_list_new_item_block"):
            html = elem.get_attribute('outerHTML')
            review_infos.append(parse_review(html))
        print(f"Done page {i+1} of {page_count}")

        # No need to navigate further once the last requested page is read.
        if i + 1 >= page_count:
            break
        # find_elements returns an empty list instead of raising, so running
        # out of pages ends the loop and keeps the reviews gathered so far.
        next_buttons = driver.find_elements(By.CLASS_NAME, "pagenext")
        if not next_buttons:
            break
        next_buttons[0].click()
    return review_infos
def extract_url(url: str):
    """Return ``url`` without its query string, pointing at the reviews tab.

    Everything from the first ``?`` or ``#`` onward is dropped before
    ``#tab-reviews`` is appended, so a URL that already carries a fragment
    (including ``#tab-reviews`` itself) does not end up with two fragments.

    Args:
        url: Page address, optionally with a query string and/or fragment.

    Returns:
        The address with the query and fragment replaced by ``#tab-reviews``.
    """
    base = url.split("?", 1)[0].split("#", 1)[0]
    return base + "#tab-reviews"