clean_text.py 1.7 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243
  1. import re
  def text_cleaner(text):
      """Strip emoji and HTML markup from *text* and return plain text.

      Emoji runs are replaced by a space, whitespace is collapsed, block-level
      closing tags and <br> become newlines, the <head> section is dropped,
      each hyperlink is replaced by its href target, and any remaining tags
      are removed. Leading whitespace at the start of the string and all
      trailing whitespace are stripped.

      Args:
          text: The raw (possibly HTML) string to clean.

      Returns:
          The cleaned string.
      """
      # Emoji are treated as noise: every run of them becomes a single space.
      emoji_pattern = re.compile(
          "["
          u"\U0001F600-\U0001F64F"  # emoticons
          u"\U0001F300-\U0001F5FF"  # symbols & pictographs
          u"\U0001F680-\U0001F6FF"  # transport & map symbols
          u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
          "]+",
          flags=re.UNICODE,
      )
      text = emoji_pattern.sub(' ', text)

      # Ordered (pattern, replacement) pairs. Order matters: whitespace is
      # collapsed first so the newlines inserted by later rules survive.
      rules = [
          (r'>\s+', u'>'),                     # drop whitespace right after any tag
          (r'\s+', u' '),                      # collapse whitespace runs to one space
          (r'\s*<br\s*/?>\s*', u'\n'),         # <br> becomes a newline
          (r'</(div)\s*>\s*', u'\n'),          # single newline after </div>
          (r'</(p|h\d)\s*>\s*', u'\n\n'),      # blank line after </p> and </hN>
          (r'<head>.*<\s*(/head|body)[^>]*>', u''),  # drop <head> through </head>/<body>
          # Non-greedy link text: with a greedy '.*' two links on one line were
          # merged into a single match, losing everything between them.
          (r'<a\s+href="([^"]+)"[^>]*>.*?</a>', r'\1'),  # keep the URL, not the label
          (r'[ \t]*<[^<]*?/?>', u''),          # remove any remaining tags
          (r'^\s+', u''),                      # strip whitespace at the very start
      ]
      for pattern, replacement in rules:
          text = re.sub(pattern, replacement, text)

      return text.rstrip()
  def text_formatter(text):
      """Normalise punctuation and doctor titles in *text*.

      Colons (with any following whitespace) become a single space, '&' and
      '/' become ', ', each newline (with any adjacent dots) becomes a single
      '.', and a standalone "dr" token in any letter case, together with any
      dots or whitespace that follow it, is rewritten as "Dr. ".

      Args:
          text: The string to format.

      Returns:
          The formatted string.
      """
      text = re.sub(r':\s*', ' ', text)
      text = re.sub(r'&', ', ', text)
      text = re.sub(r'/', ', ', text)
      # Join lines into sentences: a newline plus adjacent dots becomes one dot.
      text = re.sub(r'\.*\n\.*', '.', text)
      # The (?!\w) lookahead restricts the match to a standalone "dr" token;
      # without it, words that merely start with "dr" (e.g. "Drink", "drug")
      # were rewritten as "Dr. ink" / "Dr. ug".
      text = re.sub(r'^[dD][rR](?!\w)[.\s]*', 'Dr. ', text)
      text = re.sub(r'\s[dD][rR](?!\w)[.\s]*', ' Dr. ', text)
      return text