ByteTrekProjects
/
fast-spacy


			
							12345678910111213141516171819202122232425262728293031323334353637383940414243
							import re


# Emoji code-point ranges that are replaced by a space before HTML cleanup.
_EMOJI_PATTERN = re.compile(
    "["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "]+",
    flags=re.UNICODE,
)

# Ordered (compiled pattern, replacement) pairs. The order matters: all
# whitespace is collapsed first, and only afterwards are newlines re-inserted
# for block-level tags, so the only line breaks in the result come from tags.
_HTML_RULES = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        (r'>\s+', u'>'),  # drop whitespace right after any '>'
        (r'\s+', u' '),  # collapse whitespace runs into one space
        (r'\s*<br\s*/?>\s*', u'\n'),  # <br> becomes a newline
        (r'</(div)\s*>\s*', u'\n'),  # one newline after </div>
        (r'</(p|h\d)\s*>\s*', u'\n\n'),  # blank line after </p> and </hN>
        (r'<head>.*<\s*(/head|body)[^>]*>', u''),  # drop the <head> section
        # Show the link target instead of the link text.  The link text is
        # matched non-greedily so that several links on the same line are
        # rewritten one by one instead of being merged into the first href.
        (r'<a\s+href="([^"]+)"[^>]*>.*?</a>', r'\1'),
        (r'[ \t]*<[^<]*?/?>', u''),  # remove every remaining tag
        (r'^\s+', u''),  # strip leading whitespace
    )
)


def text_cleaner(text):
    """Strip emoji and HTML markup from *text* and return plain text.

    Emoji in the ranges of ``_EMOJI_PATTERN`` are replaced by a space, then
    the rewrite rules in ``_HTML_RULES`` are applied in order: whitespace is
    normalised, ``<br>``, ``</div>``, ``</p>`` and ``</hN>`` become newlines,
    the ``<head>`` section is dropped, ``<a href="...">`` elements are
    replaced by their href value, and all remaining tags are removed.
    Leading and trailing whitespace is stripped from the result.

    Args:
        text: The raw (possibly HTML) string to clean.

    Returns:
        The cleaned string.
    """
    text = _EMOJI_PATTERN.sub(' ', text)
    for regex, replacement in _HTML_RULES:
        text = regex.sub(replacement, text)
    return text.rstrip()


def text_formatter(text):
    """Normalise punctuation and the title "Dr" in *text*.

    The following substitutions are applied in order:

    * a colon and any whitespace after it become a single space;
    * ``&`` and ``/`` each become ``', '``;
    * a newline, together with any dots directly around it, becomes ``'.'``;
    * a standalone "dr" (any letter case) at the start of the string or after
      a whitespace character, together with any dots/whitespace following
      it, becomes ``'Dr. '``.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string.
    """
    text = re.sub(r':\s*', ' ', text)
    text = re.sub(r'&', ', ', text)
    text = re.sub(r'/', ', ', text)
    text = re.sub(r'\.*\n\.*', '.', text)
    # The \b after "dr" keeps ordinary words that merely start with those
    # letters ("drive", "drink", ...) from being rewritten to "Dr. ive".
    # '^' has no MULTILINE flag here, so it only matches the start of the
    # string; the second pattern handles "dr" after a whitespace character.
    text = re.sub(r'^[dD][rR]\b[.\s]*', 'Dr. ', text)
    text = re.sub(r'\s[dD][rR]\b[.\s]*', ' Dr. ', text)
    return text