scrapper.py 2.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748
  1. import requests
  2. import datetime
  3. from django.utils import timezone
  4. from bs4 import BeautifulSoup as bSoup
  5. pages_start = [i*20 for i in range(10)]
  6. BASE_URL = 'https://www.yelp.com/biz/signaturecare-emergency-center-montrose-houston-2'
  7. PARAMETER = '?sort_by=date_desc&start='
  8. def date_string2timezone(date):
  9. month, day, year = map(int, date.split('/'))
  10. date = datetime.datetime(day=day, month=month, year=year)
  11. time_zone = timezone.make_aware(date)
  12. return time_zone
  13. def scrape_reviews(location, n_pages):
  14. start_params = pages_start[:n_pages]
  15. review_collection = []
  16. for start in start_params:
  17. url = BASE_URL + PARAMETER + str(start)
  18. response = requests.get(url).text
  19. html_soup = bSoup(response, 'html.parser')
  20. review_items = html_soup.findAll('li', class_='lemon--li__373c0__1r9wz margin-b3__373c0__q1DuY padding-b3__373c0__342DA border--bottom__373c0__3qNtD border-color--default__373c0__3-ifU')
  21. # A single review content
  22. for review_content in review_items:
  23. profile = review_content.find('a', class_='lemon--a__373c0__IEZFH link__373c0__1G70M link-color--inherit__373c0__3dzpk link-size--inherit__373c0__1VFlE')
  24. name = profile.getText()
  25. profile_link = profile.get('href')
  26. rating_div = review_content.find('span', class_='lemon--span__373c0__3997G display--inline__373c0__3JqBP border-color--default__373c0__3-ifU')
  27. rating = int(rating_div.div.get('aria-label').split()[0])
  28. date = review_content.find('span', class_='lemon--span__373c0__3997G text__373c0__2Kxyz text-color--mid__373c0__jCeOG text-align--left__373c0__2XGa-').text
  29. review_text = review_content.find('span', class_='lemon--span__373c0__3997G raw__373c0__3rKqk').text
  30. review_resource = {
  31. 'name': name,
  32. 'profile': 'https://www.yelp.com' + profile_link,
  33. 'rating': rating,
  34. 'date_posted': date,
  35. 'messege': review_text,
  36. 'location': location
  37. }
  38. review_collection.append(review_resource)
  39. return review_collection