|
@@ -1,10 +1,10 @@
|
|
-import requests
|
|
|
|
import random
|
|
import random
|
|
|
|
+import requests
|
|
|
|
+import mechanicalsoup
|
|
from time import sleep
|
|
from time import sleep
|
|
-from bs4 import BeautifulSoup as bSoup
|
|
|
|
from django.conf import settings
|
|
from django.conf import settings
|
|
|
|
|
|
# Offsets passed as the `start` query parameter, one per results page.
# The stride of 10 presumably matches the number of reviews Yelp shows per
# page -- TODO confirm against the live site if pagination changes.
pages_start = [i*10 for i in range(10)]

# Query-string prefix appended to a location URL; the page offset from
# `pages_start` is concatenated directly after `start=`.
PARAMETER = '?sort_by=date_desc&start='
|
|
|
|
|
|
@@ -21,39 +21,41 @@ headers = {
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
def _parse_review(review):
    """Extract the fields of a single review ``<li>`` element into a dict.

    Args:
        review: A BeautifulSoup tag for one review list item.

    Returns:
        A dict with the keys ``name``, ``profile``, ``rating``,
        ``date_posted`` and ``comment``.

    Raises:
        AttributeError: If an expected element is missing from the markup
            (a lookup returned ``None``), e.g. after the site's generated
            CSS class names change.
    """
    user_link = review.find('div', class_='user-passport-info').a
    rating_div = review.find('div', class_='i-stars__373c0__1T6rz')
    comment_p = review.find('p', class_='comment__373c0__1M-px css-n6i4z7')
    return {
        'name': user_link.getText(),
        # The href is concatenated onto the site origin, so it is presumably
        # site-relative -- verify if profile links ever come back absolute.
        'profile': 'https://www.yelp.com' + user_link.attrs.get('href'),
        # Only the first whitespace-separated token of the aria-label is
        # used as the integer score.
        'rating': int(rating_div.attrs.get('aria-label').split()[0]),
        'date_posted': review.find('span', class_='css-e81eai').getText(),
        'comment': comment_p.span.getText(),
    }


def scrape_reviews(location_url, n_pages):
    """Scrape up to ``n_pages`` pages of reviews for one location.

    Pages are requested in the order given by ``pages_start``; each page URL
    is ``location_url + PARAMETER + <offset>``.

    Args:
        location_url: Base URL of the location's page.
        n_pages: Maximum number of result pages to fetch (used to slice
            ``pages_start``).

    Returns:
        A list of review dicts (see ``_parse_review``). Scraping stops at the
        first page that contains no review items and whatever was collected
        so far is returned. ``None`` is returned only when that empty page is
        reached before any review was collected.

    Raises:
        AttributeError: If a fetched page or a review item does not have the
            expected structure.
    """
    yelp_reviews = []

    # A single browser is reused for every page so the HTTP session is shared
    # across requests and can be closed deterministically afterwards.
    # NOTE(review): the module-level `headers` dict is not applied to this
    # browser -- confirm whether that is intentional.
    browser = mechanicalsoup.StatefulBrowser()
    try:
        for page_index, start in enumerate(pages_start[:n_pages]):
            if page_index:
                # Pause 30-60 seconds between requests to look more
                # human-like; no pause is needed before the first request
                # or after the last one.
                sleep(random.randint(30, 60))

            browser.open(location_url + PARAMETER + str(start))
            review_items = browser.page.findAll('li', class_='margin-b5__373c0__2ErL8')

            if not review_items:
                if not yelp_reviews:
                    # Nothing found at all: keep the original "no reviews"
                    # signal for callers.
                    return None
                # Ran past the last page: keep what was already scraped
                # instead of discarding it.
                break

            yelp_reviews.extend(_parse_review(item) for item in review_items)
    finally:
        browser.close()

    return yelp_reviews
|
|
|
|
|
|
|
|
|