|
@@ -0,0 +1,48 @@
|
|
|
+import requests
|
|
|
+import datetime
|
|
|
+from django.utils import timezone
|
|
|
+from bs4 import BeautifulSoup as bSoup
|
|
|
+
|
|
|
+# `start=` offsets for the first ten result pages; the step of 20 is
+# presumably Yelp's reviews-per-page count.
+pages_start = list(range(0, 200, 20))
+
+# Business page the reviews are scraped from.
+BASE_URL = 'https://www.yelp.com/biz/signaturecare-emergency-center-montrose-houston-2'
+# Query-string prefix (sorted by date, descending); a result offset is
+# appended to it to select a page.
+PARAMETER = '?sort_by=date_desc&start='
|
|
|
+
|
|
|
+
|
|
|
+def date_string2timezone(date):
+    """Turn a ``'month/day/year'`` string into a timezone-aware datetime.
+
+    The three slash-separated fields are parsed as integers and combined
+    into a ``datetime.datetime`` at midnight, which is then handed to
+    ``timezone.make_aware`` (called without an explicit timezone). The
+    value returned by ``make_aware`` is returned unchanged.
+
+    Raises ``ValueError`` when the string does not hold exactly three
+    integer fields or the fields do not form a valid calendar date.
+    """
+    month, day, year = (int(field) for field in date.split('/'))
+    naive_midnight = datetime.datetime(year, month, day)
+    return timezone.make_aware(naive_midnight)
|
|
|
+
|
|
|
+
|
|
|
+# Step between consecutive `start=` offsets (matches the spacing used by
+# the module-level page offsets).
+_REVIEWS_PER_PAGE = 20
+
+# Generated CSS class strings used to locate the pieces of a review. They
+# must match the page markup exactly, so they are kept verbatim.
+_REVIEW_ITEM_CLASS = 'lemon--li__373c0__1r9wz margin-b3__373c0__q1DuY padding-b3__373c0__342DA border--bottom__373c0__3qNtD border-color--default__373c0__3-ifU'
+_PROFILE_LINK_CLASS = 'lemon--a__373c0__IEZFH link__373c0__1G70M link-color--inherit__373c0__3dzpk link-size--inherit__373c0__1VFlE'
+_RATING_SPAN_CLASS = 'lemon--span__373c0__3997G display--inline__373c0__3JqBP border-color--default__373c0__3-ifU'
+_DATE_SPAN_CLASS = 'lemon--span__373c0__3997G text__373c0__2Kxyz text-color--mid__373c0__jCeOG text-align--left__373c0__2XGa-'
+_REVIEW_TEXT_CLASS = 'lemon--span__373c0__3997G raw__373c0__3rKqk'
+
+
+def _parse_review(review_content, location):
+    """Build the review dict for one review ``<li>``; None if it is incomplete."""
+    profile = review_content.find('a', class_=_PROFILE_LINK_CLASS)
+    rating_span = review_content.find('span', class_=_RATING_SPAN_CLASS)
+    date_span = review_content.find('span', class_=_DATE_SPAN_CLASS)
+    text_span = review_content.find('span', class_=_REVIEW_TEXT_CLASS)
+    # `find` returns None when nothing matches; an item lacking any of the
+    # expected elements cannot be turned into a full review.
+    if any(node is None for node in (profile, rating_span, date_span, text_span)):
+        return None
+
+    profile_link = profile.get('href')
+    rating_holder = rating_span.div
+    rating_label = None if rating_holder is None else rating_holder.get('aria-label')
+    if profile_link is None or rating_label is None:
+        return None
+
+    # The rating is the leading integer token of the aria-label text.
+    try:
+        rating = int(rating_label.split()[0])
+    except (IndexError, ValueError):
+        return None
+
+    return {
+        'name': profile.getText(),
+        'profile': 'https://www.yelp.com' + profile_link,
+        'rating': rating,
+        'date_posted': date_span.text,
+        # NOTE: the key is misspelled, but it is kept as-is because
+        # consumers of these dicts read it under this name.
+        'messege': text_span.text,
+        'location': location,
+    }
+
+
+def scrape_reviews(location, n_pages, *, timeout=10):
+    """Scrape reviews from the Yelp page at ``BASE_URL``.
+
+    Pages are requested in order using ``PARAMETER`` plus a result offset
+    (0, 20, 40, ...). Paging stops after ``n_pages`` pages or at the first
+    page that contains no review items, whichever comes first.
+
+    Args:
+        location: Value stored under the ``'location'`` key of every
+            returned review. It does not influence which URL is fetched.
+        n_pages: Maximum number of result pages to request. Zero or a
+            negative number requests nothing.
+        timeout: Seconds to wait for each HTTP request before
+            ``requests`` raises a timeout error, so a stalled connection
+            cannot hang the caller indefinitely.
+
+    Returns:
+        A list of dicts with the keys ``'name'``, ``'profile'``,
+        ``'rating'``, ``'date_posted'``, ``'messege'`` and ``'location'``.
+        Review items missing any expected element are skipped.
+
+    Raises:
+        requests.RequestException: If a page cannot be fetched.
+    """
+    review_collection = []
+    for page_index in range(n_pages):
+        url = BASE_URL + PARAMETER + str(page_index * _REVIEWS_PER_PAGE)
+        response = requests.get(url, timeout=timeout)
+        html_soup = bSoup(response.text, 'html.parser')
+
+        review_items = html_soup.find_all('li', class_=_REVIEW_ITEM_CLASS)
+        if not review_items:
+            # No review items on this page: either the listing is
+            # exhausted or the markup no longer matches, so further
+            # requests would be wasted.
+            break
+
+        for review_content in review_items:
+            review_resource = _parse_review(review_content, location)
+            if review_resource is not None:
+                review_collection.append(review_resource)
+    return review_collection
|