1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859 |
- import requests
- import random
- from time import sleep
- from bs4 import BeautifulSoup as bSoup
- from .utils import date_string2timezone
# Offsets for the `start` query parameter: ten pages, stepping by 20
# (0, 20, ..., 180).
pages_start = list(range(0, 200, 20))

# Query-string fragment appended to a location URL; the page offset is
# concatenated directly after `start=`.
PARAMETER = '?sort_by=date_desc&start='

# Browser-like request headers sent with every page request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:50.0) Gecko/20100101 Firefox/50.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate, br',
    'Upgrade-Insecure-Requests': '1',
    'Connection': 'keep-alive',
}
def scrape_reviews(location_url, max_date, n_pages):
    """Scrape review pages for a location and return reviews newer than max_date.

    Pages are requested one at a time using the offsets in the module-level
    ``pages_start`` list, so at most ``len(pages_start)`` pages are fetched
    regardless of how large ``n_pages`` is.

    Args:
        location_url: Base URL of the location page; the sort/offset query
            string (``PARAMETER`` plus the page offset) is appended to it.
        max_date: Lower bound (exclusive) for review dates. Must be
            comparable with the value returned by ``date_string2timezone``.
        n_pages: Number of result pages to request.

    Returns:
        A list of dicts with the keys ``name``, ``profile``, ``rating``,
        ``date_posted`` and ``comment`` for every review whose date is
        greater than ``max_date``. If a page contains no review items,
        scraping stops and whatever was collected so far is returned, or
        ``None`` when nothing had been collected.

    Raises:
        requests.exceptions.RequestException: If a page request fails or
            does not complete within the timeout.
        AttributeError: If a review item lacks one of the expected elements
            (the markup is matched by exact CSS class strings).
    """
    # Exact class attribute values of the elements that hold each field.
    review_item_class = 'lemon--li__373c0__1r9wz margin-b3__373c0__q1DuY padding-b3__373c0__342DA border--bottom__373c0__3qNtD border-color--default__373c0__3-ifU'
    profile_class = 'lemon--a__373c0__IEZFH link__373c0__1G70M link-color--inherit__373c0__3dzpk link-size--inherit__373c0__1VFlE'
    rating_class = 'lemon--span__373c0__3997G display--inline__373c0__3JqBP border-color--default__373c0__3-ifU'
    date_class = 'lemon--span__373c0__3997G text__373c0__2Kxyz text-color--mid__373c0__jCeOG text-align--left__373c0__2XGa-'
    text_class = 'lemon--span__373c0__3997G raw__373c0__3rcx7'

    yelp_reviews = []
    for page_index, start in enumerate(pages_start[:n_pages]):
        if page_index:
            # Pause 30-60 seconds between requests to look more human-like.
            # Sleeping only *between* requests avoids a pointless wait after
            # the final page or before an early return.
            sleep(random.randint(30, 60))

        url = location_url + PARAMETER + str(start)
        # A timeout keeps a stalled connection from blocking forever.
        response = requests.get(url, headers=headers, timeout=30).text
        html_soup = bSoup(response, 'html.parser')
        review_items = html_soup.find_all('li', class_=review_item_class)

        if not review_items:
            # No reviews on this page: stop, but keep what earlier pages
            # produced. None is still returned when nothing was collected.
            return yelp_reviews or None

        for review_content in review_items:
            profile = review_content.find('a', class_=profile_class)
            name = profile.get_text()
            profile_link = profile.get('href')

            rating_div = review_content.find('span', class_=rating_class)
            # The first whitespace-separated token of the aria-label is
            # parsed as the integer rating.
            rating = int(rating_div.div.get('aria-label').split()[0])

            date_posted = review_content.find('span', class_=date_class).text
            date = date_string2timezone(date_posted)

            review_text = review_content.find('span', class_=text_class).text

            if date > max_date:
                yelp_reviews.append({
                    'name': name,
                    'profile': profile_link,
                    'rating': rating,
                    'date_posted': date,
                    'comment': review_text,
                })
    return yelp_reviews
|