name_extraction.py 1.3 KB

12345678910111213141516171819202122232425262728293031323334353637383940
  1. from django.conf import settings
  2. from django.utils import timezone
  3. from review.models import Review
  4. nlp = settings.MODEL
  5. STOP_WORDS = ['signature', 'care', 'emergency', 'er', 'center', 'nurse', 'dr', 'dr.', 'signaturecare']
  6. def extract_names_1st(location_id):
  7. '''
  8. Extract all names that presented in this month's all reviews
  9. :param location_id: string -> ID of the location which you want to collect reviews
  10. :return: names: list -> A list of names that found in reviews
  11. '''
  12. reviews = Review.objects.filter(
  13. location_id=location_id,
  14. create_time__gte=timezone.now().replace(day=1, hour=0, minute=0, second=0, microsecond=0)
  15. )\
  16. .exclude(comment=None)
  17. names = []
  18. for r in reviews:
  19. doc = nlp(r.comment)
  20. for e in doc.ents:
  21. names.append(e.text) if e.label_ in ['PERSON', 'ORG'] else None
  22. return names
  23. def extract_names_2nd(names):
  24. '''
  25. 2nd order names extraction.
  26. removing all hand labeled stop words
  27. :param names: List -> list of name.
  28. :return: List -> list of name.
  29. '''
  30. new_names = []
  31. for name in names:
  32. for n in name.split():
  33. if n.lower() not in STOP_WORDS and len(n) > 2:
  34. new_names.append(n.lower())
  35. return new_names