|
@@ -2,13 +2,14 @@ import requests
|
|
import random
|
|
import random
|
|
from time import sleep
|
|
from time import sleep
|
|
from bs4 import BeautifulSoup as bSoup
|
|
from bs4 import BeautifulSoup as bSoup
|
|
-
|
|
|
|
-from .utils import date_string2timezone
|
|
|
|
|
|
+from django.conf import settings
|
|
|
|
|
|
pages_start = [i*20 for i in range(10)]
|
|
pages_start = [i*20 for i in range(10)]
|
|
|
|
|
|
PARAMETER = '?sort_by=date_desc&start='
|
|
PARAMETER = '?sort_by=date_desc&start='
|
|
|
|
|
|
|
|
+BROWSER = getattr(settings, 'BROWSER_URI')
|
|
|
|
+
|
|
|
|
|
|
headers = {
|
|
headers = {
|
|
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:50.0) Gecko/20100101 Firefox/50.0',
|
|
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:50.0) Gecko/20100101 Firefox/50.0',
|
|
@@ -34,7 +35,6 @@ def scrape_reviews(location_url, max_date, n_pages):
|
|
|
|
|
|
review_items = html_soup.findAll('li', class_='lemon--li__373c0__1r9wz margin-b3__373c0__q1DuY padding-b3__373c0__342DA border--bottom__373c0__3qNtD border-color--default__373c0__3-ifU')
|
|
review_items = html_soup.findAll('li', class_='lemon--li__373c0__1r9wz margin-b3__373c0__q1DuY padding-b3__373c0__342DA border--bottom__373c0__3qNtD border-color--default__373c0__3-ifU')
|
|
|
|
|
|
-
|
|
|
|
# A single review content
|
|
# A single review content
|
|
if not review_items:
|
|
if not review_items:
|
|
return None
|
|
return None
|
|
@@ -45,16 +45,19 @@ def scrape_reviews(location_url, max_date, n_pages):
|
|
rating_div = review_content.find('span', class_='lemon--span__373c0__3997G display--inline__373c0__3JqBP border-color--default__373c0__3-ifU')
|
|
rating_div = review_content.find('span', class_='lemon--span__373c0__3997G display--inline__373c0__3JqBP border-color--default__373c0__3-ifU')
|
|
rating = int(rating_div.div.get('aria-label').split()[0])
|
|
rating = int(rating_div.div.get('aria-label').split()[0])
|
|
date_posted = review_content.find('span', class_='lemon--span__373c0__3997G text__373c0__2Kxyz text-color--mid__373c0__jCeOG text-align--left__373c0__2XGa-').text
|
|
date_posted = review_content.find('span', class_='lemon--span__373c0__3997G text__373c0__2Kxyz text-color--mid__373c0__jCeOG text-align--left__373c0__2XGa-').text
|
|
- date = date_string2timezone(date_posted)
|
|
|
|
review_text = review_content.find('span', class_='lemon--span__373c0__3997G raw__373c0__3rcx7').text
|
|
review_text = review_content.find('span', class_='lemon--span__373c0__3997G raw__373c0__3rcx7').text
|
|
-
|
|
|
|
- if date > max_date:
|
|
|
|
- review = {
|
|
|
|
- 'name': name,
|
|
|
|
- 'profile': profile_link,
|
|
|
|
- 'rating': rating,
|
|
|
|
- 'date_posted': date,
|
|
|
|
- 'comment': review_text,
|
|
|
|
- }
|
|
|
|
- yelp_reviews.append(review)
|
|
|
|
|
|
+ review = {
|
|
|
|
+ 'name': name,
|
|
|
|
+ 'profile': profile_link,
|
|
|
|
+ 'rating': rating,
|
|
|
|
+ 'date_posted': date_posted,
|
|
|
|
+ 'comment': review_text,
|
|
|
|
+ }
|
|
|
|
+ yelp_reviews.append(review)
|
|
return yelp_reviews
|
|
return yelp_reviews
|
|
|
|
+
|
|
|
|
+
|
|
|
|
def scrape_reviews_using_browser(location_url, timeout=None):
    """Fetch reviews for a location through the external browser service.

    Sends a GET request to the ``/yelp/reviews`` endpoint under the
    ``BROWSER`` base URI, passing ``location_url`` as the ``url`` query
    parameter, and returns the ``reviews`` entry of the JSON response.

    Args:
        location_url: URL of the page whose reviews should be scraped.
        timeout: Optional timeout (seconds, or a ``(connect, read)`` tuple)
            forwarded to ``requests.get``. The default ``None`` waits
            indefinitely, which matches the previous behaviour.

    Returns:
        The value stored under ``'reviews'`` in the decoded JSON body, or
        ``None`` when that key is absent.

    Raises:
        requests.RequestException: If the request itself fails.
        ValueError: If the response body is not valid JSON.
    """
    endpoint = BROWSER + '/yelp/reviews'
    # Let requests percent-encode the target URL. Interpolating it raw would
    # let any '?', '&' or '#' inside location_url be parsed as part of the
    # service's own query string, truncating the value it receives.
    response = requests.get(
        endpoint,
        params={'url': location_url},
        timeout=timeout,
    )
    # NOTE(review): assumes the service replies with a JSON object; confirm
    # against the browser service's contract.
    payload = response.json()
    return payload.get('reviews')
|