12345678910111213141516171819202122232425262728293031323334353637383940 |
- from django.conf import settings
- from django.utils import timezone
- from review.models import Review
- nlp = settings.MODEL  # NLP callable from Django settings; used below as nlp(text).ents (presumably a loaded spaCy pipeline - TODO confirm)
- STOP_WORDS = ['signature', 'care', 'emergency', 'er', 'center', 'nurse', 'dr', 'dr.', 'signaturecare']  # lower-case tokens that extract_names_2nd discards
def extract_names_1st(location_id):
    '''
    First-pass name extraction from the current month's reviews.

    Fetches the comment of every review for ``location_id`` created on or
    after the first day of the current month (reviews whose comment is NULL
    are skipped), runs each comment through the module-level ``nlp`` callable
    and collects the text of every entity labelled ``PERSON`` or ``ORG``.

    :param location_id: string -> ID of the location whose reviews are scanned
    :return: names: list -> Entity texts in the order found; duplicates are kept
    '''
    # NOTE(review): timezone.now() is presumably UTC when USE_TZ is enabled,
    # so the month boundary would be midnight UTC rather than local midnight -
    # confirm this is the intended cut-off.
    month_start = timezone.now().replace(
        day=1, hour=0, minute=0, second=0, microsecond=0
    )

    # Only the comment text is used, so fetch that single column instead of
    # materialising whole Review instances.
    comments = (
        Review.objects
        .filter(location_id=location_id, create_time__gte=month_start)
        .exclude(comment=None)
        .values_list('comment', flat=True)
    )

    wanted_labels = ('PERSON', 'ORG')
    return [
        entity.text
        for comment in comments
        for entity in nlp(comment).ents
        if entity.label_ in wanted_labels
    ]
def extract_names_2nd(names):
    '''
    Second-pass clean-up of extracted names.

    Every entry of ``names`` is split on whitespace; each resulting token is
    kept, lower-cased, only when its lower-cased form is not listed in
    ``STOP_WORDS`` and the token is longer than two characters.

    :param names: List -> list of raw name strings.
    :return: List -> lower-cased name tokens, in input order, duplicates kept.
    '''
    return [
        token.lower()
        for raw_name in names
        for token in raw_name.split()
        if token.lower() not in STOP_WORDS and len(token) > 2
    ]
|