import re

# Emoji code-point ranges that are blanked out before any markup handling.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "]+",
    flags=re.UNICODE,
)

# Ordered (pattern, replacement) rewrite rules; order matters because the
# whitespace rules run first and the generic tag stripper must run last.
# NOTE(review): the tag-bearing parts of these patterns (<br>, </div>, </p>,
# <head>, <a href=...>) were missing from the file as found, apparently
# removed by an HTML stripper. They are restored here from what remained of
# each pattern -- please confirm against the original source if available.
_HTML_RULES = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        (r'>\s+', '>'),  # remove spaces after a tag opens or closes
        (r'\s+', ' '),  # collapse consecutive whitespace
        (r'\s*<br\s*/?>\s*', '\n'),  # newline after a <br>
        (r'</(div)\s*>\s*', '\n'),  # newline after </div>
        (r'</(p|h\d)\s*>\s*', '\n\n'),  # blank line after </p> and </h1>...
        (r'<head>.*<\s*(/head|body)[^>]*>', ''),  # remove <head> to </head>
        # Show link targets instead of link texts. Non-greedy so that several
        # anchors on one line are each replaced by their own href.
        (r'<a\s+href="([^"]+)"[^>]*>.*?</a>', r'\1'),
        (r'[ \t]*<[^<]*?/?>', ''),  # remove remaining tags
        (r'^\s+', ''),  # remove whitespace at the beginning
    )
)


def text_cleaner(text: str) -> str:
    """Strip emoji and HTML markup from *text* and return plain text.

    Emoji in the ranges of ``_EMOJI_PATTERN`` are replaced by a space, then
    the rules in ``_HTML_RULES`` are applied in order: whitespace is
    collapsed, ``<br>``/``</div>``/``</p>``/``</hN>`` become line breaks, the
    ``<head>`` section is dropped, anchors are replaced by their ``href``
    value and every remaining tag is removed. Trailing whitespace is
    stripped from the result.

    Args:
        text: The raw (possibly HTML) input string.

    Returns:
        The cleaned text.
    """
    text = _EMOJI_PATTERN.sub(' ', text)
    for regex, replacement in _HTML_RULES:
        text = regex.sub(replacement, text)
    return text.rstrip()


def text_formatter(text: str) -> str:
    """Normalise punctuation and the title "Dr" in *text*.

    The following substitutions are applied in order:

    * a colon and any whitespace after it become a single space;
    * ``&`` and ``/`` become ``", "``;
    * a newline, together with any dots directly around it, becomes ``"."``;
    * a standalone ``dr`` (any case, optionally followed by dots or
      whitespace) at the start of the text or after whitespace becomes
      ``"Dr. "``.

    Args:
        text: The text to normalise.

    Returns:
        The normalised text.
    """
    text = re.sub(r':\s*', ' ', text)
    text = re.sub(r'&', ', ', text)
    text = re.sub(r'/', ', ', text)
    text = re.sub(r'\.*\n\.*', '.', text)
    # The \b keeps words that merely start with "dr" (drug, Drive, dry)
    # from being rewritten as "Dr. ug", "Dr. ive", ...
    text = re.sub(r'^[dD][rR]\b[.\s]*', 'Dr. ', text)
    text = re.sub(r'\s[dD][rR]\b[.\s]*', ' Dr. ', text)
    return text