"""Scrape customer reviews from a hard-coded Yelp business page."""
import datetime

import requests
from bs4 import BeautifulSoup as bSoup
from django.utils import timezone

# Yelp paginates reviews 20 per page; these are the ``start`` offsets of the
# first ten pages.
pages_start = [i * 20 for i in range(10)]
BASE_URL = 'https://www.yelp.com/biz/signaturecare-emergency-center-montrose-houston-2'
PARAMETER = '?sort_by=date_desc&start='

# Seconds to wait for Yelp before giving up; without a timeout a stalled
# connection would block the scrape forever.
REQUEST_TIMEOUT = 10

# CSS class strings used to locate elements in the page markup. They are
# matched verbatim against each tag's ``class`` attribute, so they must stay
# exactly in sync with the markup being scraped.
_REVIEW_ITEM_CLASS = 'lemon--li__373c0__1r9wz margin-b3__373c0__q1DuY padding-b3__373c0__342DA border--bottom__373c0__3qNtD border-color--default__373c0__3-ifU'
_PROFILE_LINK_CLASS = 'lemon--a__373c0__IEZFH link__373c0__1G70M link-color--inherit__373c0__3dzpk link-size--inherit__373c0__1VFlE'
_RATING_SPAN_CLASS = 'lemon--span__373c0__3997G display--inline__373c0__3JqBP border-color--default__373c0__3-ifU'
_DATE_SPAN_CLASS = 'lemon--span__373c0__3997G text__373c0__2Kxyz text-color--mid__373c0__jCeOG text-align--left__373c0__2XGa-'
_TEXT_SPAN_CLASS = 'lemon--span__373c0__3997G raw__373c0__3rKqk'


def date_string2timezone(date: str) -> datetime.datetime:
    """Convert a ``month/day/year`` string into an aware datetime.

    Args:
        date: Date text with three integer fields separated by ``/``, in
            month, day, year order.

    Returns:
        A datetime at midnight of that date, made timezone-aware by
        Django's ``timezone.make_aware`` with its default time zone.

    Raises:
        ValueError: If the string does not split into exactly three
            integer fields, or the fields do not form a valid date.
    """
    month, day, year = map(int, date.split('/'))
    naive = datetime.datetime(year=year, month=month, day=day)
    return timezone.make_aware(naive)


def _parse_review(review_content, location):
    """Build one review dict from a single review ``<li>`` element."""
    profile = review_content.find('a', class_=_PROFILE_LINK_CLASS)
    name = profile.get_text()
    profile_link = profile.get('href')

    rating_div = review_content.find('span', class_=_RATING_SPAN_CLASS)
    # The rating is the leading number of the nested div's aria-label.
    rating = int(rating_div.div.get('aria-label').split()[0])

    date = review_content.find('span', class_=_DATE_SPAN_CLASS).text
    review_text = review_content.find('span', class_=_TEXT_SPAN_CLASS).text

    return {
        'name': name,
        'profile': 'https://www.yelp.com' + profile_link,
        'rating': rating,
        'date_posted': date,
        # NOTE: the key spelling 'messege' is kept as-is; consumers of this
        # dict may rely on it.
        'messege': review_text,
        'location': location,
    }


def scrape_reviews(location, n_pages, timeout=REQUEST_TIMEOUT):
    """Fetch review pages from ``BASE_URL`` and return the parsed reviews.

    Args:
        location: Value stored unchanged under the ``'location'`` key of
            every returned review. It does not affect which page is
            scraped; the URL is always built from ``BASE_URL``.
        n_pages: Number of pages to fetch, newest reviews first. The page
            offsets are taken from ``pages_start[:n_pages]``, so at most
            ``len(pages_start)`` pages are ever requested.
        timeout: Seconds to wait for each HTTP request.

    Returns:
        A list of dicts, one per review, with the keys ``'name'``,
        ``'profile'``, ``'rating'``, ``'date_posted'`` (the raw date text
        from the page), ``'messege'`` and ``'location'``.

    Raises:
        requests.RequestException: If a request fails or times out.
        AttributeError: If a review item lacks one of the expected
            elements, because the lookup then yields ``None``.
    """
    review_collection = []
    for start in pages_start[:n_pages]:
        url = BASE_URL + PARAMETER + str(start)
        response = requests.get(url, timeout=timeout).text
        html_soup = bSoup(response, 'html.parser')
        review_items = html_soup.find_all('li', class_=_REVIEW_ITEM_CLASS)

        # Each item holds a single review's content.
        for review_content in review_items:
            review_collection.append(_parse_review(review_content, location))
    return review_collection