Ver código fonte

Switched the Yelp scraper from BeautifulSoup to MechanicalSoup

Mohidul Islam 4 anos atrás
pai
commit
fcb021bbbd
2 arquivos alterados com 21 adições e 17 exclusões
  1. 2 0
      requirements.txt
  2. 19 17
      yelp/scrapper.py

+ 2 - 0
requirements.txt

@@ -20,7 +20,9 @@ idna==2.9
 importlib-metadata==2.0.0
 jdcal==1.4.1
 kiwisolver==1.3.1
+lxml==4.6.3
 matplotlib==3.3.4
+MechanicalSoup==1.0.0
 murmurhash==1.0.2
 mysqlclient==1.4.6
 nameparser==1.0.6

+ 19 - 17
yelp/scrapper.py

@@ -1,10 +1,10 @@
-import requests
 import random
+import requests
+import mechanicalsoup
 from time import sleep
-from bs4 import BeautifulSoup as bSoup
 from django.conf import settings
 
-pages_start = [i*20 for i in range(10)]
+pages_start = [i*10 for i in range(10)]
 
 PARAMETER = '?sort_by=date_desc&start='
 
@@ -21,39 +21,41 @@ headers = {
 }
 
 
-def scrape_reviews(location_url, max_date, n_pages):
+def scrape_reviews(location_url, n_pages):
     start_params = pages_start[:n_pages]
     yelp_reviews = []
 
     for start in start_params:
         url = location_url + PARAMETER + str(start)
-        response = requests.get(url, headers=headers).text
+        browser = mechanicalsoup.StatefulBrowser()
+        browser.open(url)
 
         # pause for 30-60 seconds to make the request more human-like.
         sleep(random.randint(30, 60))
-        html_soup = bSoup(response, 'html.parser')
+        html_soup = browser.page
 
-        review_items = html_soup.findAll('li', class_='lemon--li__373c0__1r9wz margin-b3__373c0__q1DuY padding-b3__373c0__342DA border--bottom__373c0__3qNtD border-color--default__373c0__3-ifU')
+        review_items = html_soup.findAll('li', class_='margin-b5__373c0__2ErL8')
 
         # A single review content
         if not review_items:
             return None
-        for review_content in review_items:
-            profile = review_content.find('a', class_='lemon--a__373c0__IEZFH link__373c0__1G70M link-color--inherit__373c0__3dzpk link-size--inherit__373c0__1VFlE')
-            name = profile.getText()
-            profile_link = profile.get('href')
-            rating_div = review_content.find('span', class_='lemon--span__373c0__3997G display--inline__373c0__3JqBP border-color--default__373c0__3-ifU')
-            rating = int(rating_div.div.get('aria-label').split()[0])
-            date_posted = review_content.find('span', class_='lemon--span__373c0__3997G text__373c0__2Kxyz text-color--mid__373c0__jCeOG text-align--left__373c0__2XGa-').text
-            review_text = review_content.find('span', class_='lemon--span__373c0__3997G raw__373c0__3rcx7').text
-            review = {
+        for review in review_items:
+            user_detail_div = review.find('div', class_='user-passport-info')
+            name = user_detail_div.a.getText()
+            profile_link = 'https://www.yelp.com' + user_detail_div.a.attrs.get('href')
+            rating_div = review.find('div', class_='i-stars__373c0__1T6rz')
+            rating = int(rating_div.attrs.get('aria-label').split()[0])
+            date_posted = review.find('span', class_='css-e81eai').getText()
+            comment_p = review.find('p', class_='comment__373c0__1M-px css-n6i4z7')
+            review_text = comment_p.span.getText()
+            review_dict = {
                 'name': name,
                 'profile': profile_link,
                 'rating': rating,
                 'date_posted': date_posted,
                 'comment': review_text,
             }
-            yelp_reviews.append(review)
+            yelp_reviews.append(review_dict)
     return yelp_reviews