|
@@ -1,4 +1,6 @@
|
|
|
import requests
|
|
|
+import random
|
|
|
+from time import sleep
|
|
|
from bs4 import BeautifulSoup as bSoup
|
|
|
|
|
|
from .utils import date_string2timezone
|
|
@@ -8,18 +10,33 @@ pages_start = [i*20 for i in range(10)]
|
|
|
PARAMETER = '?sort_by=date_desc&start='
|
|
|
|
|
|
|
|
|
+headers = {
|
|
|
+ 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:50.0) Gecko/20100101 Firefox/50.0',
|
|
|
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
|
|
+ 'Accept-Language': 'en-US,en;q=0.5',
|
|
|
+ 'Accept-Encoding': 'gzip, deflate, br',
|
|
|
+ 'Upgrade-Insecure-Requests': '1',
|
|
|
+ 'Connection': 'keep-alive'
|
|
|
+}
|
|
|
+
|
|
|
+
|
|
|
def scrape_reviews(location_url, max_date, n_pages):
|
|
|
start_params = pages_start[:n_pages]
|
|
|
yelp_reviews = []
|
|
|
|
|
|
for start in start_params:
|
|
|
url = location_url + PARAMETER + str(start)
|
|
|
- response = requests.get(url).text
|
|
|
+ response = requests.get(url, headers=headers).text
|
|
|
+
|
|
|
+        # Pause for 60-120 seconds between requests to appear more human-like.
|
|
|
+ sleep(random.randint(60, 120))
|
|
|
html_soup = bSoup(response, 'html.parser')
|
|
|
|
|
|
review_items = html_soup.findAll('li', class_='lemon--li__373c0__1r9wz margin-b3__373c0__q1DuY padding-b3__373c0__342DA border--bottom__373c0__3qNtD border-color--default__373c0__3-ifU')
|
|
|
|
|
|
# A single review content
|
|
|
+ if not review_items:
|
|
|
+ return None
|
|
|
for review_content in review_items:
|
|
|
profile = review_content.find('a', class_='lemon--a__373c0__IEZFH link__373c0__1G70M link-color--inherit__373c0__3dzpk link-size--inherit__373c0__1VFlE')
|
|
|
name = profile.getText()
|