|
@@ -1,27 +1,19 @@
|
|
import requests
|
|
import requests
|
|
-import datetime
|
|
|
|
-from django.utils import timezone
|
|
|
|
from bs4 import BeautifulSoup as bSoup
|
|
from bs4 import BeautifulSoup as bSoup
|
|
|
|
|
|
|
|
+from .utils import date_string2timezone
|
|
|
|
+
|
|
pages_start = [i*20 for i in range(10)]
|
|
pages_start = [i*20 for i in range(10)]
|
|
|
|
|
|
-BASE_URL = 'https://www.yelp.com/biz/signaturecare-emergency-center-montrose-houston-2'
|
|
|
|
PARAMETER = '?sort_by=date_desc&start='
|
|
PARAMETER = '?sort_by=date_desc&start='
|
|
|
|
|
|
|
|
|
|
-def date_string2timezone(date):
|
|
|
|
- month, day, year = map(int, date.split('/'))
|
|
|
|
- date = datetime.datetime(day=day, month=month, year=year)
|
|
|
|
- time_zone = timezone.make_aware(date)
|
|
|
|
- return time_zone
|
|
|
|
-
|
|
|
|
-
|
|
|
|
-def scrape_reviews(location, n_pages):
|
|
|
|
|
|
+def scrape_reviews(location_url, max_date, n_pages):
|
|
start_params = pages_start[:n_pages]
|
|
start_params = pages_start[:n_pages]
|
|
|
|
+ yelp_reviews = []
|
|
|
|
|
|
- review_collection = []
|
|
|
|
for start in start_params:
|
|
for start in start_params:
|
|
- url = BASE_URL + PARAMETER + str(start)
|
|
|
|
|
|
+ url = location_url + PARAMETER + str(start)
|
|
response = requests.get(url).text
|
|
response = requests.get(url).text
|
|
html_soup = bSoup(response, 'html.parser')
|
|
html_soup = bSoup(response, 'html.parser')
|
|
|
|
|
|
@@ -34,15 +26,17 @@ def scrape_reviews(location, n_pages):
|
|
profile_link = profile.get('href')
|
|
profile_link = profile.get('href')
|
|
rating_div = review_content.find('span', class_='lemon--span__373c0__3997G display--inline__373c0__3JqBP border-color--default__373c0__3-ifU')
|
|
rating_div = review_content.find('span', class_='lemon--span__373c0__3997G display--inline__373c0__3JqBP border-color--default__373c0__3-ifU')
|
|
rating = int(rating_div.div.get('aria-label').split()[0])
|
|
rating = int(rating_div.div.get('aria-label').split()[0])
|
|
- date = review_content.find('span', class_='lemon--span__373c0__3997G text__373c0__2Kxyz text-color--mid__373c0__jCeOG text-align--left__373c0__2XGa-').text
|
|
|
|
|
|
+ date_posted = review_content.find('span', class_='lemon--span__373c0__3997G text__373c0__2Kxyz text-color--mid__373c0__jCeOG text-align--left__373c0__2XGa-').text
|
|
|
|
+ date = date_string2timezone(date_posted)
|
|
review_text = review_content.find('span', class_='lemon--span__373c0__3997G raw__373c0__3rKqk').text
|
|
review_text = review_content.find('span', class_='lemon--span__373c0__3997G raw__373c0__3rKqk').text
|
|
- review_resource = {
|
|
|
|
- 'name': name,
|
|
|
|
- 'profile': 'https://www.yelp.com' + profile_link,
|
|
|
|
- 'rating': rating,
|
|
|
|
- 'date_posted': date,
|
|
|
|
- 'messege': review_text,
|
|
|
|
- 'location': location
|
|
|
|
- }
|
|
|
|
- review_collection.append(review_resource)
|
|
|
|
- return review_collection
|
|
|
|
|
|
+
|
|
|
|
+ if date > max_date:
|
|
|
|
+ review = {
|
|
|
|
+ 'name': name,
|
|
|
|
+ 'profile': profile_link,
|
|
|
|
+ 'rating': rating,
|
|
|
|
+ 'date_posted': date,
|
|
|
|
+ 'comment': review_text,
|
|
|
|
+ }
|
|
|
|
+ yelp_reviews.append(review)
|
|
|
|
+ return yelp_reviews
|