před 4 roky · 53b61d2bda
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,6 @@
 
				+env/
			
 
				+.vscode
			
 
				+__pycache__/
			
 
				+*.py[cod]
			
 
				+*$py.class
			
 
				+
			
--- a/src/__init__.py
+++ b/src/__init__.py
--- a/src/app.py
+++ b/src/app.py
@@ -0,0 +1,50 @@
 
				+from fastapi import FastAPI, Body
			
 
				+
			
 
				+from .models import get_model
			
 
				+from .clean_text import text_cleaner, text_formatter
			
 
				+
			
 
				+
			
 
				+app = FastAPI(
			
 
				+    title="Name extractor using Spacy",
			
 
				+    description="Extract all names from a row text",
			
 
				+    version="0.1",
			
 
				+)
			
 
				+model = get_model()
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+@app.get("/")
			
 
				+async def home():
			
 
				+    return {"status": "Server is Up!"}
			
 
				+
			
 
				+
			
 
				+@app.post('/name')
			
 
				+def is_a_human_name(data: dict = Body(...)):
			
 
				+    text = data.get('name')
			
 
				+    doc = model(text)
			
 
				+    is_name = False
			
 
				+    for ent in doc.ents:
			
 
				+        if ent.label_ in ['PERSON']:
			
 
				+            is_name = True
			
 
				+
			
 
				+    return {
			
 
				+        'name': is_name
			
 
				+    }
			
 
				+
			
 
				+
			
 
				+@app.post('/names/all')
			
 
				+def get_all_names(data: dict = Body(...)):
			
 
				+    text = data.get('text')
			
 
				+    print(text)
			
 
				+    text = text_cleaner(text)
			
 
				+    text = text_formatter(text)
			
 
				+    print(text)
			
 
				+    doc = model(text)
			
 
				+    names = {
			
 
				+        ent.text for ent in doc.ents if ent.label_ in ['PERSON', 'ORG', 'PRODUCT']
			
 
				+    }
			
 
				+    print(names)
			
 
				+    return {
			
 
				+        'names': names
			
 
				+    }
			
--- a/src/clean_text.py
+++ b/src/clean_text.py
@@ -0,0 +1,44 @@
 
				+import re
			
 
				+
			
 
				+
			
 
				+def text_cleaner(text):
			
 
				+    # Remove all emogies which don't contribute anything.
			
 
				+    emoji_pattern = re.compile("["
			
 
				+                               u"\U0001F600-\U0001F64F"  # emoticons
			
 
				+                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
			
 
				+                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
			
 
				+                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
			
 
				+                               "]+", flags=re.UNICODE)
			
 
				+    text = re.sub(emoji_pattern, ' ', text)
			
 
				+    
			
 
				+    rules = [
			
 
				+        {r'>\s+': u'>'},  # remove spaces after a tag opens or closes
			
 
				+        {r'\s+': u' '},  # replace consecutive spaces
			
 
				+        {r'\s*<br\s*/?>\s*': u'\n'},  # newline after a <br>
			
 
				+        {r'</(div)\s*>\s*': u'\n'},  # newline after </p> and </div> and <h1/>...
			
 
				+        {r'</(p|h\d)\s*>\s*': u'\n\n'},  # newline after </p> and </div> and <h1/>...
			
 
				+        {r'<head>.*<\s*(/head|body)[^>]*>': u''},  # remove <head> to </head>
			
 
				+        {r'<a\s+href="([^"]+)"[^>]*>.*</a>': r'\1'},  # show links instead of texts
			
 
				+        {r'[ \t]*<[^<]*?/?>': u''},  # remove remaining tags
			
 
				+        {r'^\s+': u''}  # remove spaces at the beginning
			
 
				+    ]
			
 
				+
			
 
				+    for rule in rules:
			
 
				+        for (k, v) in rule.items():
			
 
				+            regex = re.compile(k)
			
 
				+            text = regex.sub(v, text)
			
 
				+    text = text.rstrip()
			
 
				+    return text
			
 
				+
			
 
				+
			
 
				+
			
 
				+def text_formatter(text):
			
 
				+    # replace some letter in text for getting better performance
			
 
				+    text = re.sub(r':\s*', ' ', text)
			
 
				+    text = re.sub(r'&', ', ', text)
			
 
				+    text = re.sub(r'/', ', ', text)
			
 
				+    text = re.sub(r'\.*\n\.*', '.', text)
			
 
				+    text = re.sub(r'^[dD][rR](\.|\s*)*', 'Dr. ', text)
			
 
				+    text = re.sub(r'\s[dD][rR](\.|\s*)*', ' Dr. ', text)
			
 
				+    return text
			
 
				+
			
--- a/src/main.py
+++ b/src/main.py
@@ -0,0 +1,50 @@
 
				+from fastapi import FastAPI, Body
			
 
				+
			
 
				+from .models import get_model
			
 
				+from .clean_text import text_cleaner, text_formatter
			
 
				+
			
 
				+
			
 
				+app = FastAPI(
			
 
				+    title="Name extractor using Spacy",
			
 
				+    description="Extract all names from a row text",
			
 
				+    version="0.1",
			
 
				+)
			
 
				+model = get_model()
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+@app.get("/")
			
 
				+async def home():
			
 
				+    return {"status": "Server is Up!"}
			
 
				+
			
 
				+
			
 
				+@app.post('/name')
			
 
				+def is_a_human_name(data: dict = Body(...)):
			
 
				+    text = data.get('name')
			
 
				+    doc = model(text)
			
 
				+    is_name = False
			
 
				+    for ent in doc.ents:
			
 
				+        if ent.label_ in ['PERSON']:
			
 
				+            is_name = True
			
 
				+
			
 
				+    response =  {
			
 
				+        'name': is_name
			
 
				+    }
			
 
				+    return response
			
 
				+
			
 
				+
			
 
				+@app.post('/names/all')
			
 
				+def get_all_names(data: dict = Body(...)):
			
 
				+    text = data.get('text')
			
 
				+    text = text_cleaner(text)
			
 
				+    text = text_formatter(text)
			
 
				+    doc = model(text)
			
 
				+    names = {
			
 
				+        ent.text for ent in doc.ents if ent.label_ in ['PERSON', 'ORG', 'PRODUCT']
			
 
				+    }
			
 
				+
			
 
				+    response =  {
			
 
				+        'names': names
			
 
				+    }
			
 
				+    return response
			
--- a/src/models.py
+++ b/src/models.py
@@ -0,0 +1,11 @@
 
				+import spacy
			
 
				+
			
 
				+MODEL_NAME = 'en_core_web_sm'
			
 
				+
			
 
				+def get_model():
			
 
				+    try:
			
 
				+        spacy_ner_model = spacy.load(MODEL_NAME)
			
 
				+        print(f"\033[92mSpacy model named {spacy_ner_model.meta.get('name')} has loaded!\033[0m")
			
 
				+    except OSError:
			
 
				+        print('\033[93mNo spacy model has found. Please install a small sized spacy model.\033[0m')
			
 
				+    return spacy_ner_model