Ver código fonte

Make scrape requests much more human-like

Mohidul Islam 5 anos atrás
pai
commit
af61f93781
3 arquivos alterados com 29 adições e 3 exclusões
  1. 1 0
      yelp/admin.py
  2. 18 1
      yelp/scrapper.py
  3. 10 2
      yelp/store_reviews.py

+ 1 - 0
yelp/admin.py

@@ -11,6 +11,7 @@ class YelpReviewAdmin(admin.ModelAdmin):
     list_display = ['reviewer_name', 'rating', 'date_posted', 'comment']
     search_fields = ['reviewer_name', 'comment']
     list_filter = ('location', 'rating',)
+    ordering = ['-date_posted']
 
 
 admin.site.register(YelpLocation, YelpLocationAdmin)

+ 18 - 1
yelp/scrapper.py

@@ -1,4 +1,6 @@
 import requests
+import random
+from time import sleep
 from bs4 import BeautifulSoup as bSoup
 
 from .utils import date_string2timezone
@@ -8,18 +10,33 @@ pages_start = [i*20 for i in range(10)]
 PARAMETER = '?sort_by=date_desc&start='
 
 
+headers = {
+    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:50.0) Gecko/20100101 Firefox/50.0',
+    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+    'Accept-Language': 'en-US,en;q=0.5',
+    'Accept-Encoding': 'gzip, deflate, br',
+    'Upgrade-Insecure-Requests': '1',
+    'Connection': 'keep-alive'
+}
+
+
 def scrape_reviews(location_url, max_date, n_pages):
     start_params = pages_start[:n_pages]
     yelp_reviews = []
 
     for start in start_params:
         url = location_url + PARAMETER + str(start)
-        response = requests.get(url).text
+        response = requests.get(url, headers=headers).text
+
+        # Pause for 60-120 seconds to make requests more human-like.
+        sleep(random.randint(60, 120))
         html_soup = bSoup(response, 'html.parser')
 
         review_items = html_soup.findAll('li', class_='lemon--li__373c0__1r9wz margin-b3__373c0__q1DuY padding-b3__373c0__342DA border--bottom__373c0__3qNtD border-color--default__373c0__3-ifU')
 
         # A single review content
+        if not review_items:
+            return None
         for review_content in review_items:
             profile = review_content.find('a', class_='lemon--a__373c0__IEZFH link__373c0__1G70M link-color--inherit__373c0__3dzpk link-size--inherit__373c0__1VFlE')
             name = profile.getText()

+ 10 - 2
yelp/store_reviews.py

@@ -1,8 +1,9 @@
 from .scrapper import scrape_reviews
 from .utils import get_max_date, store_into_database
+from .models import YelpLocation
 
 
-def populate_yelp_reviews(location, n_pages):
+def store_yelp_reviews(location, n_pages):
     location_url = location.url
     max_date = get_max_date(location)
     reviews = scrape_reviews(
@@ -10,4 +11,11 @@ def populate_yelp_reviews(location, n_pages):
         max_date=max_date,
         n_pages=n_pages
     )
-    store_into_database(reviews, location)
+    if reviews:
+        store_into_database(reviews, location)
+
+
+def populate_reviews():
+    locations = YelpLocation.objects.all()
+    for yl in locations:
+        store_yelp_reviews(yl, 1)