scrapper.py 2.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263
  1. import requests
  2. import random
  3. from time import sleep
  4. from bs4 import BeautifulSoup as bSoup
  5. from django.conf import settings
  6. pages_start = [i*20 for i in range(10)]
  7. PARAMETER = '?sort_by=date_desc&start='
  8. BROWSER = getattr(settings, 'BROWSER_URI')
  9. headers = {
  10. 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:50.0) Gecko/20100101 Firefox/50.0',
  11. 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  12. 'Accept-Language': 'en-US,en;q=0.5',
  13. 'Accept-Encoding': 'gzip, deflate, br',
  14. 'Upgrade-Insecure-Requests': '1',
  15. 'Connection': 'keep-alive'
  16. }
  17. def scrape_reviews(location_url, max_date, n_pages):
  18. start_params = pages_start[:n_pages]
  19. yelp_reviews = []
  20. for start in start_params:
  21. url = location_url + PARAMETER + str(start)
  22. response = requests.get(url, headers=headers).text
  23. # pause for 60-120 to make request more human-like.
  24. sleep(random.randint(30, 60))
  25. html_soup = bSoup(response, 'html.parser')
  26. review_items = html_soup.findAll('li', class_='lemon--li__373c0__1r9wz margin-b3__373c0__q1DuY padding-b3__373c0__342DA border--bottom__373c0__3qNtD border-color--default__373c0__3-ifU')
  27. # A single review content
  28. if not review_items:
  29. return None
  30. for review_content in review_items:
  31. profile = review_content.find('a', class_='lemon--a__373c0__IEZFH link__373c0__1G70M link-color--inherit__373c0__3dzpk link-size--inherit__373c0__1VFlE')
  32. name = profile.getText()
  33. profile_link = profile.get('href')
  34. rating_div = review_content.find('span', class_='lemon--span__373c0__3997G display--inline__373c0__3JqBP border-color--default__373c0__3-ifU')
  35. rating = int(rating_div.div.get('aria-label').split()[0])
  36. date_posted = review_content.find('span', class_='lemon--span__373c0__3997G text__373c0__2Kxyz text-color--mid__373c0__jCeOG text-align--left__373c0__2XGa-').text
  37. review_text = review_content.find('span', class_='lemon--span__373c0__3997G raw__373c0__3rcx7').text
  38. review = {
  39. 'name': name,
  40. 'profile': profile_link,
  41. 'rating': rating,
  42. 'date_posted': date_posted,
  43. 'comment': review_text,
  44. }
  45. yelp_reviews.append(review)
  46. return yelp_reviews
  47. def scrape_reviews_using_browser(location_url):
  48. url = BROWSER + f'/yelp/reviews?url={location_url}'
  49. response = requests.get(url).json()
  50. return response.get('reviews')