Browse Source

Add some utility function and change model field

Mohidul Islam cách đây 5 năm
mục cha
commit
51f1346183
5 tập tin đã thay đổi với 89 bổ sung và 25 xóa
  1. 18 0
      yelp/migrations/0002_auto_20200609_0829.py
  2. 1 1
      yelp/models.py
  3. 18 24
      yelp/scrapper.py
  4. 13 0
      yelp/store_reviews.py
  5. 39 0
      yelp/utils.py

+ 18 - 0
yelp/migrations/0002_auto_20200609_0829.py

@@ -0,0 +1,18 @@
+# Generated by Django 3.0.4 on 2020-06-09 08:29
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+    # Schema migration for the ``yelp`` app: alters the ``date_posted`` field
+    # of the ``yelpreview`` model so that it is a ``DateTimeField``.
+
+    # Must run after the app's initial migration.
+    dependencies = [
+        ('yelp', '0001_initial'),
+    ]
+
+    # Single operation: redefine ``yelpreview.date_posted`` as a DateTimeField
+    # with default options.
+    operations = [
+        migrations.AlterField(
+            model_name='yelpreview',
+            name='date_posted',
+            field=models.DateTimeField(),
+        ),
+    ]

+ 1 - 1
yelp/models.py

@@ -14,7 +14,7 @@ class YelpReview(models.Model):
     reviewer_name = models.CharField(max_length=255)
     profile = models.URLField()
     rating = models.IntegerField()
-    date_posted = models.DateField()
+    date_posted = models.DateTimeField()
     comment = models.TextField()
     location = models.ForeignKey(YelpLocation, on_delete=models.CASCADE)
 

+ 18 - 24
yelp/scrapper.py

@@ -1,27 +1,19 @@
 import requests
-import datetime
-from django.utils import timezone
 from bs4 import BeautifulSoup as bSoup
 
+from .utils import date_string2timezone
+
 pages_start = [i*20 for i in range(10)]
 
-BASE_URL = 'https://www.yelp.com/biz/signaturecare-emergency-center-montrose-houston-2'
 PARAMETER = '?sort_by=date_desc&start='
 
 
-def date_string2timezone(date):
-    month, day, year = map(int,  date.split('/'))
-    date = datetime.datetime(day=day, month=month, year=year)
-    time_zone = timezone.make_aware(date)
-    return time_zone
-
-
-def scrape_reviews(location, n_pages):
+def scrape_reviews(location_url, max_date, n_pages):
     start_params = pages_start[:n_pages]
+    yelp_reviews = []
 
-    review_collection = []
     for start in start_params:
-        url = BASE_URL + PARAMETER + str(start)
+        url = location_url + PARAMETER + str(start)
         response = requests.get(url).text
         html_soup = bSoup(response, 'html.parser')
 
@@ -34,15 +26,17 @@ def scrape_reviews(location, n_pages):
             profile_link = profile.get('href')
             rating_div = review_content.find('span', class_='lemon--span__373c0__3997G display--inline__373c0__3JqBP border-color--default__373c0__3-ifU')
             rating = int(rating_div.div.get('aria-label').split()[0])
-            date = review_content.find('span', class_='lemon--span__373c0__3997G text__373c0__2Kxyz text-color--mid__373c0__jCeOG text-align--left__373c0__2XGa-').text
+            date_posted = review_content.find('span', class_='lemon--span__373c0__3997G text__373c0__2Kxyz text-color--mid__373c0__jCeOG text-align--left__373c0__2XGa-').text
+            date = date_string2timezone(date_posted)
             review_text = review_content.find('span', class_='lemon--span__373c0__3997G raw__373c0__3rKqk').text
-            review_resource = {
-                'name': name,
-                'profile': 'https://www.yelp.com' + profile_link,
-                'rating': rating,
-                'date_posted': date,
-                'messege': review_text,
-                'location': location
-            }
-            review_collection.append(review_resource)
-    return review_collection
+
+            if date > max_date:
+                review = {
+                    'name': name,
+                    'profile': profile_link,
+                    'rating': rating,
+                    'date_posted': date,
+                    'comment': review_text,
+                }
+                yelp_reviews.append(review)
+    return yelp_reviews

+ 13 - 0
yelp/store_reviews.py

@@ -0,0 +1,13 @@
+from .scrapper import scrape_reviews
+from .utils import get_max_date, store_into_database
+
+
+def populate_yelp_reviews(location, n_pages):
+    """Scrape reviews for ``location`` and store them in the database.
+
+    The location's ``url`` attribute is used as the page to scrape, and the
+    value returned by ``get_max_date(location)`` is handed to the scraper as
+    its ``max_date`` argument. Whatever the scraper returns is passed on to
+    ``store_into_database`` together with ``location``.
+
+    Args:
+        location: Object exposing a ``url`` attribute; also forwarded to
+            ``get_max_date`` and ``store_into_database``.
+        n_pages: Number of result pages to request, forwarded unchanged to
+            ``scrape_reviews``.
+    """
+    fresh_reviews = scrape_reviews(
+        location_url=location.url,
+        max_date=get_max_date(location),
+        n_pages=n_pages,
+    )
+    store_into_database(fresh_reviews, location)

+ 39 - 0
yelp/utils.py

@@ -0,0 +1,39 @@
+import datetime
+from django.utils import timezone
+from django.db.models import Max
+
+from .models import YelpReview, YelpLocation
+
+
+def date_string2timezone(date):
+    """Convert a ``month/day/year`` string into an aware ``datetime``.
+
+    The string is split on ``/`` and each of the three pieces is parsed with
+    ``int``; a ``ValueError`` is raised if a piece is not an integer or if
+    there are not exactly three pieces. No time of day is supplied, so the
+    resulting datetime is at midnight before being passed to
+    ``timezone.make_aware`` (called without an explicit time zone).
+
+    Args:
+        date: Date text such as ``'7/2/1995'`` (month first, then day).
+
+    Returns:
+        The value returned by ``timezone.make_aware`` for that date.
+    """
+    month, day, year = (int(piece) for piece in date.split('/'))
+    naive_moment = datetime.datetime(year=year, month=month, day=day)
+    return timezone.make_aware(naive_moment)
+
+
+def get_max_date(yelp_location):
+    """Return the most recent ``date_posted`` among a location's reviews.
+
+    The maximum is computed by the database through an aggregate over
+    ``yelp_location.yelpreview_set``. When that aggregate is ``None`` (for
+    example, no reviews are stored for the location yet), a fixed fallback
+    date of 7/2/1995 is returned instead.
+
+    Args:
+        yelp_location: Object exposing a ``yelpreview_set`` related manager.
+
+    Returns:
+        The aggregated maximum ``date_posted`` or the fallback datetime.
+    """
+    aggregated = yelp_location.yelpreview_set.all().aggregate(Max('date_posted'))
+    latest = aggregated['date_posted__max']
+    if latest is None:
+        return date_string2timezone('7/2/1995')
+    return latest
+
+
+def store_into_database(reviews, location):
+    """Persist scraped review dicts as ``YelpReview`` rows for ``location``.
+
+    Each review is looked up by the full combination of its fields plus the
+    location; a row is inserted only when no identical row exists, so
+    re-running the import with the same data does not create duplicates.
+
+    Args:
+        reviews: Iterable of dicts with the keys ``name``, ``profile``,
+            ``rating``, ``date_posted`` and ``comment``. A missing key is
+            read as ``None``.
+        location: Value stored in the ``location`` field of every row.
+    """
+    for rev in reviews:
+        # All fields are part of the lookup, so there is never anything left
+        # to update on a match. ``get_or_create`` expresses that directly and
+        # avoids the row lock and redundant save that ``update_or_create``
+        # performs on every existing row.
+        _, created = YelpReview.objects.get_or_create(
+            reviewer_name=rev.get('name'),
+            profile=rev.get('profile'),
+            rating=rev.get('rating'),
+            date_posted=rev.get('date_posted'),
+            comment=rev.get('comment'),
+            location=location,
+        )
+        if created:
+            print('A new object has been created!')