Mohidul Islam 4 года назад
Родитель
Commit
dee67c53c6
2 измененных файла с 7 добавлениями и 3 удалениями
  1. 3 1
      yelp/scrapper.py
  2. 4 2
      yelp/utils.py

+ 3 - 1
yelp/scrapper.py

@@ -41,8 +41,10 @@ def scrape_reviews(location_url, n_pages):
             return None
         for review in review_items:
             user_detail_div = review.find('div', class_='user-passport-info')
+            if not user_detail_div:
+                continue
             name = user_detail_div.a.getText()
-            profile_link = 'https://www.yelp.com' + user_detail_div.a.attrs.get('href')
+            profile_link = user_detail_div.a.attrs.get('href')
             rating_div = review.find('div', class_='i-stars__373c0__1T6rz')
             rating = int(rating_div.attrs.get('aria-label').split()[0])
             date_posted = review.find('span', class_='css-e81eai').getText()

+ 4 - 2
yelp/utils.py

@@ -5,7 +5,7 @@ from django.utils import timezone
 from django.db.models import Max
 
 from .models import YelpReview, YelpLocation
-from .scrapper import scrape_reviews_using_browser
+from .scrapper import scrape_reviews
 
 
 def date_string2timezone(date):
@@ -41,12 +41,14 @@ def store_into_database(reviews, location):
         )
         if created:
             print(f'A new review object has been created for {location}!')
+        else:
+            print(f'Review already exist')
 
 
 def populate_yelp_reviews():
     yelp_locations = YelpLocation.objects.all()
     for yl in yelp_locations:
-        reviews = scrape_reviews_using_browser(location_url=yl.url)
+        reviews = scrape_reviews(location_url=yl.url, n_pages=2)
         store_into_database(reviews, yl)
         # pause for 30-60 sec to make request more human-like.
         sleep(random.randint(30, 60))