from django.conf import settings
from django.utils import timezone

from review.models import Review

# NLP pipeline taken from Django settings and called directly on comment text.
# NOTE(review): presumably a loaded spaCy model, given the ``.ents`` /
# ``.label_`` access below -- confirm against the settings module.
nlp = settings.MODEL

# Hand-labelled, lowercase tokens that extract_names_2nd discards.
STOP_WORDS = ['signature', 'care', 'emergency', 'er', 'center', 'nurse', 'dr', 'dr.', 'signaturecare']

# Entity labels that are collected as candidate names.
_NAME_LABELS = ('PERSON', 'ORG')


def extract_names_1st(location_id):
    '''
    Extract every entity labelled PERSON or ORG from this month's reviews.

    Only reviews of the given location whose ``create_time`` is on or after
    the first day of the current month (at 00:00:00) and whose ``comment``
    is not NULL are considered.

    :param location_id: string -> ID of the location whose reviews are scanned
    :return: names: list -> entity texts in the order found; duplicates are kept
    '''
    # NOTE(review): timezone.now() is expressed in UTC when USE_TZ is enabled,
    # so the month boundary is midnight UTC rather than local midnight --
    # confirm this is the intended cut-off.
    month_start = timezone.now().replace(day=1, hour=0, minute=0, second=0, microsecond=0)

    # Only the comment text is needed, so skip building full Review instances.
    comments = (
        Review.objects
        .filter(location_id=location_id, create_time__gte=month_start)
        .exclude(comment=None)
        .values_list('comment', flat=True)
    )

    names = []
    for comment in comments:
        names.extend(
            ent.text for ent in nlp(comment).ents if ent.label_ in _NAME_LABELS
        )
    return names


def extract_names_2nd(names):
    '''
    2nd order names extraction: split names into words and drop stop words.

    Each name is split on whitespace; a word is kept (lowercased) only when it
    is longer than two characters and its lowercase form is not in STOP_WORDS.

    :param names: List -> list of names (strings), e.g. from extract_names_1st
    :return: List -> list of lowercased name words; duplicates are kept
    '''
    new_names = []
    for name in names:
        for token in name.split():
            lowered = token.lower()
            # Length is checked on the original token, as lowercasing can
            # change the length of some Unicode strings.
            if lowered not in STOP_WORDS and len(token) > 2:
                new_names.append(lowered)
    return new_names