"""Helpers for collecting Yelp reviews.

Reviews are obtained either by scraping the location's review pages directly
(``scrape_reviews``) or by delegating to an external browser service
(``scrape_reviews_using_browser``).
"""
import random
from time import sleep

import mechanicalsoup
import requests
from django.conf import settings

# ``start`` offsets of the first ten result pages: 0, 10, ..., 90.
pages_start = [i * 10 for i in range(10)]
# Query-string prefix: newest reviews first; the start offset is appended.
PARAMETER = '?sort_by=date_desc&start='
# Base URI of the browser service; raises AttributeError at import time when
# the BROWSER_URI setting is not defined.
BROWSER = getattr(settings, 'BROWSER_URI')

# Browser-like request headers.
# NOTE(review): not referenced by the functions visible in this module --
# confirm whether other modules import it before removing.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:50.0) Gecko/20100101 Firefox/50.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate, br',
    'Upgrade-Insecure-Requests': '1',
    'Connection': 'keep-alive'
}


def _parse_review(review):
    """Extract the fields of a single review from its ``<li>`` element.

    Returns a dict with the keys ``name``, ``profile``, ``rating``,
    ``date_posted`` and ``comment``, or ``None`` when any element the
    extraction relies on is missing or the rating label cannot be parsed.
    """
    user_detail_div = review.find('div', class_='user-passport-info')
    if user_detail_div is None or user_detail_div.a is None:
        return None

    rating_div = review.find('div', class_='i-stars__373c0__1T6rz')
    date_span = review.find('span', class_='css-e81eai')
    comment_p = review.find('p', class_='comment__373c0__1M-px css-n6i4z7')
    if rating_div is None or date_span is None:
        return None
    if comment_p is None or comment_p.span is None:
        return None

    # The rating is the first whitespace-separated token of the aria-label.
    try:
        rating = int(rating_div.attrs.get('aria-label', '').split()[0])
    except (IndexError, ValueError):
        return None

    user_link = user_detail_div.a
    return {
        'name': user_link.get_text(),
        'profile': user_link.attrs.get('href'),
        'rating': rating,
        'date_posted': date_span.get_text(),
        'comment': comment_p.span.get_text(),
    }


def scrape_reviews(location_url, n_pages):
    """Scrape up to ``n_pages`` pages of reviews for a location.

    Args:
        location_url: URL of the location page; the sort/start query string
            is appended to it verbatim.
        n_pages: Number of result pages to fetch. It slices ``pages_start``,
            so at most ten pages are ever requested.

    Returns:
        A list of review dicts (see ``_parse_review``). When a page contains
        no review items, paging stops and the reviews collected so far are
        returned, or ``None`` if nothing has been collected yet.
    """
    start_params = pages_start[:n_pages]
    yelp_reviews = []

    for page_index, start in enumerate(start_params):
        if page_index:
            # Pause 30-60 seconds between page requests to look more
            # human-like; no pause is needed before the first request or
            # after the last one.
            sleep(random.randint(30, 60))

        url = location_url + PARAMETER + str(start)
        browser = mechanicalsoup.StatefulBrowser()
        browser.open(url)
        html_soup = browser.page

        # ``page`` is None when the response was not parsed as HTML.
        if html_soup is None:
            review_items = []
        else:
            # Each matching <li> holds a single review.
            review_items = html_soup.find_all('li', class_='margin-b5__373c0__2ErL8')

        if not review_items:
            # Keep what earlier pages produced instead of discarding it.
            return yelp_reviews or None

        for review in review_items:
            review_dict = _parse_review(review)
            if review_dict is not None:
                yelp_reviews.append(review_dict)

    return yelp_reviews


def scrape_reviews_using_browser(location_url):
    """Fetch reviews for ``location_url`` through the browser service.

    Sends a GET request to ``<BROWSER>/yelp/reviews`` with the location URL
    as the ``url`` query parameter and returns the ``reviews`` entry of the
    JSON response (``None`` when that key is absent).
    """
    # Pass the target through ``params`` so it is percent-encoded; a raw
    # URL containing '&', '?' or '#' would otherwise corrupt the query.
    response = requests.get(BROWSER + '/yelp/reviews', params={'url': location_url})
    return response.json().get('reviews')