utils.py 515 B

123456789101112131415161718192021
  1. import re
  2. from django.conf import settings
  # Shared NLP model object read from the Django settings; extract_names() calls it on cleaned text.
  3. nlp = settings.MODEL
  4. def clean_text(text):
  5. # replace some letter in text for getting better performance
  6. text = re.sub(r':\s*', ' ', text)
  7. text = re.sub(r'&', ',', text)
  8. text = re.sub(r'\.*\n\.*', '.', text)
  9. text = re.sub(r'[dD][rR](\.|\s*)*', 'Dr. ', text)
  10. return text
  11. def extract_names(text):
  12. text = clean_text(text)
  13. doc = nlp(text)
  14. names = {ent.text for ent in doc.ents if ent.label_ in ['PERSON', 'ORG']}
  15. return list(names)