12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667 |
- import random
- import requests
- import mechanicalsoup
- from time import sleep
- from django.conf import settings
# Offsets concatenated after `start=` in the page URL, one per page request:
# 0, 10, 20, ..., 90 (ten entries).
pages_start = list(range(0, 100, 10))

# Query-string suffix appended to a location URL; it asks for newest-first
# ordering and leaves `start=` open for the page offset.
PARAMETER = '?sort_by=date_desc&start='

# Base URI taken from the Django setting BROWSER_URI.
BROWSER = settings.BROWSER_URI

# Browser-like HTTP request headers (Firefox 50 on macOS).
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:50.0) Gecko/20100101 Firefox/50.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate, br',
    'Upgrade-Insecure-Requests': '1',
    'Connection': 'keep-alive',
}
def scrape_reviews(location_url, n_pages):
    """Scrape review pages for one location, newest reviews first.

    One request is made per offset in ``pages_start[:n_pages]``, so the
    number of pages fetched is capped by the length of the module-level
    ``pages_start`` list. After every request the function sleeps for a
    random 30-60 seconds.

    Args:
        location_url: Base URL of the location page. ``PARAMETER`` and the
            page offset are concatenated onto it unchanged.
        n_pages: Number of result pages to request.

    Returns:
        A list of dicts with the keys ``name``, ``profile``, ``rating``
        (int), ``date_posted`` and ``comment``. Scraping stops at the first
        page that yields no review items; the reviews gathered up to that
        point are returned, or ``None`` if nothing had been gathered yet.

    Raises:
        AttributeError: If a review item has the user block but lacks the
            expected rating, date or comment markup.
    """
    yelp_reviews = []
    for start in pages_start[:n_pages]:
        url = location_url + PARAMETER + str(start)

        # A fresh browser per page (as before), but closed deterministically
        # so its HTTP session is not left open until garbage collection.
        with mechanicalsoup.StatefulBrowser() as browser:
            browser.open(url)
            html_soup = browser.page

        # Pause for 30-60 seconds to make the requests look more human-like.
        sleep(random.randint(30, 60))

        # `page` can be None (e.g. a non-HTML response); treat that like a
        # page without reviews instead of failing on None.find_all.
        if html_soup is None:
            review_items = []
        else:
            review_items = html_soup.find_all(
                'li', class_='margin-b5__373c0__2ErL8')

        if not review_items:
            # No more reviews: stop paging, but keep what earlier pages
            # produced. None is still returned when nothing was collected.
            return yelp_reviews or None

        for review in review_items:
            user_detail_div = review.find('div', class_='user-passport-info')
            if not user_detail_div:
                # List items without a user block are not reviews.
                continue

            name = user_detail_div.a.get_text()
            profile_link = user_detail_div.a.attrs.get('href')

            # The leading token of the aria-label is parsed as the rating.
            rating_div = review.find('div', class_='i-stars__373c0__1T6rz')
            rating = int(rating_div.attrs.get('aria-label').split()[0])

            date_posted = review.find('span', class_='css-e81eai').get_text()

            comment_p = review.find(
                'p', class_='comment__373c0__1M-px css-n6i4z7')
            review_text = comment_p.span.get_text()

            yelp_reviews.append({
                'name': name,
                'profile': profile_link,
                'rating': rating,
                'date_posted': date_posted,
                'comment': review_text,
            })
    return yelp_reviews
def scrape_reviews_using_browser(location_url, timeout=None):
    """Fetch reviews for ``location_url`` from the service at ``BROWSER``.

    Sends a GET request to ``BROWSER + '/yelp/reviews'`` with the location
    passed as the ``url`` query parameter, and returns the ``reviews`` entry
    of the JSON response.

    Args:
        location_url: URL of the location whose reviews are wanted. It is
            URL-encoded by ``requests`` so characters such as ``?``, ``&``
            or ``#`` inside it cannot corrupt the query string.
        timeout: Optional timeout in seconds forwarded to ``requests.get``.
            The default ``None`` waits indefinitely, as before.

    Returns:
        The value stored under ``reviews`` in the JSON body, or ``None``
        when that key is absent.
    """
    endpoint = BROWSER + '/yelp/reviews'
    response = requests.get(
        endpoint,
        params={'url': location_url},
        timeout=timeout,
    )
    return response.json().get('reviews')
|