|
@@ -31,6 +31,13 @@ def clean_text(text):
|
|
text = re.sub(r'/', ', ', text)
|
|
text = re.sub(r'/', ', ', text)
|
|
text = re.sub(r'\.*\n\.*', ', ', text)
|
|
text = re.sub(r'\.*\n\.*', ', ', text)
|
|
text = re.sub(r'[dD][rR](\.|\s*)*', 'Dr. ', text)
|
|
text = re.sub(r'[dD][rR](\.|\s*)*', 'Dr. ', text)
|
|
|
|
+ emoji_pattern = re.compile("["
|
|
|
|
+ u"\U0001F600-\U0001F64F" # emoticons
|
|
|
|
+ u"\U0001F300-\U0001F5FF" # symbols & pictographs
|
|
|
|
+ u"\U0001F680-\U0001F6FF" # transport & map symbols
|
|
|
|
+ u"\U0001F1E0-\U0001F1FF" # flags (iOS)
|
|
|
|
+ "]+", flags=re.UNICODE)
|
|
|
|
+ text = re.sub(emoji_pattern, ' ', text)
|
|
return text
|
|
return text
|
|
|
|
|
|
|
|
|