scrapper.py 2.4 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768
  1. import random
  2. import requests
  3. import mechanicalsoup
  4. from time import sleep
  5. from django.conf import settings
  6. pages_start = [i*10 for i in range(10)]
  7. PARAMETER = '?sort_by=date_desc&start='
  8. BROWSER = getattr(settings, 'BROWSER_URI')
  9. headers = {
  10. 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:50.0) Gecko/20100101 Firefox/50.0',
  11. 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  12. 'Accept-Language': 'en-US,en;q=0.5',
  13. 'Accept-Encoding': 'gzip, deflate, br',
  14. 'Upgrade-Insecure-Requests': '1',
  15. 'Connection': 'keep-alive'
  16. }
  17. def scrape_reviews(location_url, n_pages):
  18. start_params = pages_start[:n_pages]
  19. yelp_reviews = []
  20. for start in start_params:
  21. url = location_url + PARAMETER + str(start)
  22. print(f"Collecting reviews from {location_url.split('/')[-1]}: {start}")
  23. browser = mechanicalsoup.StatefulBrowser()
  24. browser.open(url)
  25. # pause for 60-120 to make request more human-like.
  26. sleep(random.randint(30, 60))
  27. html_soup = browser.page
  28. review_items = html_soup.findAll('li', class_='margin-b5__373c0__3ho0z')
  29. print(f'Got {len(review_items)} reviews in this Loc.')
  30. # A single review content
  31. if not review_items:
  32. return None
  33. for review in review_items:
  34. user_detail_div = review.find('div', class_='user-passport-info')
  35. if not user_detail_div:
  36. continue
  37. name = user_detail_div.a.getText()
  38. profile_link = user_detail_div.a.attrs.get('href')
  39. rating_div = review.find('div', class_='i-stars__373c0___sZu0')
  40. rating = int(rating_div.attrs.get('aria-label').split()[0])
  41. date_posted = review.find('span', class_='css-e81eai').getText()
  42. comment_p = review.find('p', class_='comment__373c0__Nsutg css-n6i4z7')
  43. review_text = comment_p.span.getText()
  44. review_dict = {
  45. 'name': name,
  46. 'profile': profile_link,
  47. 'rating': rating,
  48. 'date_posted': date_posted,
  49. 'comment': review_text,
  50. }
  51. yelp_reviews.append(review_dict)
  52. return yelp_reviews
  53. def scrape_reviews_using_browser(location_url):
  54. url = BROWSER + f'/yelp/reviews?url={location_url}'
  55. response = requests.get(url).json()
  56. return response.get('reviews')