# Entity_Linking_Demo / entity_extraction.py
# ghomasHudson: "Remove plural names" (commit 8b1561c, 1.4 kB)
import re

import spacy
# Load the "en_core_web_md" spaCy pipeline once at import time; the resulting
# `nlp` object is shared by every call to extract_entities().
nlp = spacy.load("en_core_web_md")
# Append the "entityfishing" component to the pipeline. extract_entities()
# reads the `kb_qid`, `nerd_score` and `url_wikidata` span extensions after
# running `nlp` (presumably set by this component, which is not part of spaCy
# itself -- confirm the providing package is installed).
nlp.add_pipe("entityfishing")
# Possessive marker ('s or ’s) that directly follows a word character and ends
# the word, e.g. "Obama's" or "Obama’s," -- but not the opening quote of a
# quoted word such as "'stop'" or the middle of a name such as "O'shea".
_POSSESSIVE_RE = re.compile(r"(?<=\w)['’]s\b")


def remove_plural_names(article):
    """Strip possessive ``'s`` / ``’s`` suffixes from the words of *article*.

    Despite its name, this function removes possessive markers rather than
    plurals ("Obama's plan" becomes "Obama plan").

    Only a marker that follows a word character and ends a word is removed,
    so quoted words that start with "s" and names containing ``'s`` in the
    middle are left intact.

    Args:
        article: The raw article text.

    Returns:
        The text with possessive markers removed. Words are re-joined with
        single spaces, so any run of whitespace (including newlines) is
        collapsed and leading/trailing whitespace is dropped.
    """
    return " ".join(_POSSESSIVE_RE.sub("", word) for word in article.split())
# Entity labels considered for linking; every other label is skipped.
_LINKABLE_LABELS = frozenset({"ORG", "PERSON", "GPE"})


def extract_entities(article, *, min_score=0.5):
    """Find Wikidata references for the entities mentioned in *article*.

    The article is cleaned with ``remove_plural_names`` and run through the
    module-level ``nlp`` pipeline. An entity span is kept only when all of
    the following hold:

    * it has a ``kb_qid`` and its label is ORG, PERSON or GPE;
    * its ``nerd_score`` is at least *min_score*;
    * its exact text has not been accepted before;
    * it is not a single word equal to the last word of an earlier
      multi-word PERSON entity (e.g. "Obama" after "Barack Obama");
    * its ``kb_qid`` has not been returned already.

    Args:
        article: The raw article text.
        min_score: Minimum ``nerd_score`` an entity needs to be kept.

    Returns:
        A list of the kept entity spans, in document order, with at most one
        span per ``kb_qid``.
    """
    linked = []
    seen_texts = set()
    seen_surnames = set()
    seen_qids = set()

    doc = nlp(remove_plural_names(article))
    for ent in doc.ents:
        qid = ent._.kb_qid
        if qid is None or ent.label_ not in _LINKABLE_LABELS or ent.text in seen_texts:
            continue

        # Treat a missing score like a low one instead of letting the
        # comparison with None raise TypeError.
        score = ent._.nerd_score
        if score is None or score < min_score:
            continue

        parts = ent.text.split()
        if len(parts) == 1:
            # Single name: skip it when it repeats a surname seen earlier.
            if ent.text in seen_surnames:
                continue
        elif ent.label_ == "PERSON" and parts:
            # Multipart name: remember the last word as the surname.
            seen_surnames.add(parts[-1])

        # Recorded before the QID check, so a text whose QID is a duplicate
        # is still short-circuited by the first filter on later mentions.
        seen_texts.add(ent.text)

        if qid in seen_qids:
            continue
        seen_qids.add(qid)
        linked.append(ent)
    return linked
if __name__ == "__main__":
    # Manual check: read one article from stdin, link its entities, and print
    # one line per entity with its text, label and Wikidata URL.
    article_text = input("article: ")
    linked_entities = extract_entities(article_text)

    print()
    print("ENTITIES:")
    for entity in linked_entities:
        row = (entity.text, "\t", entity.label_, "\t", entity._.url_wikidata)
        print(*row)