|
@@ -0,0 +1,44 @@
|
|
|
+import re
|
|
|
+
|
|
|
+
|
|
|
# Emoji blocks removed by text_cleaner: emoticons, symbols & pictographs,
# transport & map symbols, and regional-indicator (flag) letters.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"
    "\U0001F300-\U0001F5FF"
    "\U0001F680-\U0001F6FF"
    "\U0001F1E0-\U0001F1FF"
    "]+",
    flags=re.UNICODE,
)

# Ordered (pattern, replacement) pairs. Order matters: whitespace is
# normalised first, block-level tags are then turned into newlines, and only
# after that are the remaining tags stripped.
_CLEANING_RULES = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        # Drop whitespace that directly follows any '>'.
        (r'>\s+', '>'),
        # Collapse every remaining whitespace run to a single space.
        (r'\s+', ' '),
        # <br> / <br/> becomes a line break.
        (r'\s*<br\s*/?>\s*', '\n'),
        # A closing </div> ends a line.
        (r'</(div)\s*>\s*', '\n'),
        # A closing paragraph or heading ends a paragraph.
        (r'</(p|h\d)\s*>\s*', '\n\n'),
        # Remove from <head> through the closing </head> or opening <body>.
        (r'<head>.*<\s*(/head|body)[^>]*>', ''),
        # Replace a link with its href. The lazy '.*?' keeps each anchor
        # separate; a greedy '.*' would swallow everything between the first
        # '<a' and the last '</a>' on the line.
        (r'<a\s+href="([^"]+)"[^>]*>.*?</a>', r'\1'),
        # Strip any remaining tag together with spaces/tabs before it.
        (r'[ \t]*<[^<]*?/?>', ''),
        # Remove leading whitespace of the whole text.
        (r'^\s+', ''),
    )
)


def text_cleaner(text):
    """Strip emoji and HTML markup from *text* and return plain text.

    Emoji in the ranges covered by ``_EMOJI_PATTERN`` are replaced by a
    space. The ordered rules in ``_CLEANING_RULES`` are then applied:
    whitespace is collapsed, ``<br>``/``</div>`` become one newline,
    ``</p>``/``</hN>`` become two newlines, the ``<head>`` section is
    dropped, links are replaced by their ``href`` value and all remaining
    tags are removed. Leading and trailing whitespace is stripped from the
    result.

    Note that whitespace directly after a ``>`` and spaces/tabs directly
    before a tag are removed as part of the rules.

    Args:
        text: The raw string, possibly containing HTML and emoji.

    Returns:
        The cleaned string.
    """
    text = _EMOJI_PATTERN.sub(' ', text)
    for pattern, replacement in _CLEANING_RULES:
        text = pattern.sub(replacement, text)
    # Trailing newlines introduced by the block-level rules are dropped here.
    return text.rstrip()
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
def text_formatter(text):
    """Normalise punctuation and the "Dr" title in *text*.

    The following substitutions are applied in order:

    * a colon and any whitespace after it become a single space;
    * ``&`` and ``/`` each become ``', '``;
    * a newline, together with any dots around it, becomes a single ``.``;
    * a standalone "dr"/"Dr"/"DR" (at the start of the text or after
      whitespace), with any dots or whitespace following it, becomes
      ``'Dr. '``.

    Args:
        text: The string to format.

    Returns:
        The formatted string.
    """
    text = re.sub(r':\s*', ' ', text)
    text = re.sub(r'&', ', ', text)
    text = re.sub(r'/', ', ', text)
    text = re.sub(r'\.*\n\.*', '.', text)
    # The \b keeps words that merely start with "dr" (e.g. "Drive", "drums")
    # from being rewritten; "[.\s]*" consumes the dots/whitespace that follow.
    text = re.sub(r'^[dD][rR]\b[.\s]*', 'Dr. ', text)
    # Same rule for a title preceded by whitespace; that whitespace character
    # is replaced by a single space.
    text = re.sub(r'\s[dD][rR]\b[.\s]*', ' Dr. ', text)
    return text
|
|
|
+
|