# scrapper.py
import requests
from bs4 import BeautifulSoup as bSoup

from .utils import date_string2timezone
  4. pages_start = [i*20 for i in range(10)]
  5. PARAMETER = '?sort_by=date_desc&start='
  6. def scrape_reviews(location_url, max_date, n_pages):
  7. start_params = pages_start[:n_pages]
  8. yelp_reviews = []
  9. for start in start_params:
  10. url = location_url + PARAMETER + str(start)
  11. response = requests.get(url).text
  12. html_soup = bSoup(response, 'html.parser')
  13. review_items = html_soup.findAll('li', class_='lemon--li__373c0__1r9wz margin-b3__373c0__q1DuY padding-b3__373c0__342DA border--bottom__373c0__3qNtD border-color--default__373c0__3-ifU')
  14. # A single review content
  15. for review_content in review_items:
  16. profile = review_content.find('a', class_='lemon--a__373c0__IEZFH link__373c0__1G70M link-color--inherit__373c0__3dzpk link-size--inherit__373c0__1VFlE')
  17. name = profile.getText()
  18. profile_link = profile.get('href')
  19. rating_div = review_content.find('span', class_='lemon--span__373c0__3997G display--inline__373c0__3JqBP border-color--default__373c0__3-ifU')
  20. rating = int(rating_div.div.get('aria-label').split()[0])
  21. date_posted = review_content.find('span', class_='lemon--span__373c0__3997G text__373c0__2Kxyz text-color--mid__373c0__jCeOG text-align--left__373c0__2XGa-').text
  22. date = date_string2timezone(date_posted)
  23. review_text = review_content.find('span', class_='lemon--span__373c0__3997G raw__373c0__3rKqk').text
  24. if date > max_date:
  25. review = {
  26. 'name': name,
  27. 'profile': profile_link,
  28. 'rating': rating,
  29. 'date_posted': date,
  30. 'comment': review_text,
  31. }
  32. yelp_reviews.append(review)
  33. return yelp_reviews