|
import re

import spacy
|
|
|
# Names of the spaCy model and of the entity-linking pipe appended to it.
_SPACY_MODEL = "en_core_web_md"
_LINKER_PIPE = "entityfishing"

# Shared pipeline, built once at import time and reused by extract_entities().
nlp = spacy.load(_SPACY_MODEL)
nlp.add_pipe(_LINKER_PIPE)
|
|
|
# Possessive suffix ('s or ’s) that directly follows a word character and ends
# the word. Anchoring it this way leaves an opening quote in front of an
# "s" word (e.g. 'special') untouched.
_POSSESSIVE_SUFFIX = re.compile(r"(?<=\w)['’]s\b")


def remove_plural_names(article):
    """Strip possessive ``'s`` / ``’s`` suffixes from every word of *article*.

    Despite its name, this function removes possessive markers, not plurals.
    Contractions that end the same way (``it's`` -> ``it``) are stripped too.

    The text is split on whitespace and re-joined with single spaces, so runs
    of whitespace (including newlines) collapse to one space and leading and
    trailing whitespace is dropped.

    Args:
        article: The raw article text.

    Returns:
        The whitespace-normalised text with possessive suffixes removed.
    """
    normalised = " ".join(article.split())
    return _POSSESSIVE_SUFFIX.sub("", normalised)
|
|
|
|
|
# Entity labels that are kept; every other label is ignored.
_LINKABLE_LABELS = frozenset({"ORG", "PERSON", "GPE"})


def extract_entities(article, *, min_score=0.5):
    """Find Wikidata references for the entities mentioned in *article*.

    The article is passed through ``remove_plural_names`` and then through
    the module-level ``nlp`` pipeline. An entity is kept only if it:

    * has a ``kb_qid`` and one of the labels ORG, PERSON or GPE,
    * has a ``nerd_score`` of at least *min_score*,
    * has a surface text not already seen,
    * is not a single word equal to the last word of an earlier multi-word
      PERSON entity (e.g. a bare surname after the full name), and
    * has a ``kb_qid`` not already returned.

    Args:
        article: The raw article text.
        min_score: Minimum ``nerd_score`` an entity needs to be kept.

    Returns:
        The kept entity spans, in document order.
    """
    ents = []
    # Sets give constant-time membership checks inside the loop.
    seen_entities = set()
    seen_surnames = set()
    seen_qids = set()

    doc = nlp(remove_plural_names(article))
    for ent in doc.ents:
        qid = ent._.kb_qid
        if qid is None or ent.label_ not in _LINKABLE_LABELS or ent.text in seen_entities:
            continue

        # NOTE(review): presumably the linker can leave nerd_score unset (None)
        # even when a QID is present - confirm. Such entities are skipped
        # instead of raising TypeError on the comparison.
        score = ent._.nerd_score
        if score is None or score < min_score:
            continue

        # NOTE(review): the elif is paired with the single-word check; this
        # nesting was reconstructed from a flattened source - confirm.
        words = ent.text.split()
        if len(words) == 1:
            # A lone word matching a known surname repeats an earlier person.
            if ent.text in seen_surnames:
                continue
        elif ent.label_ == "PERSON":
            seen_surnames.add(words[-1])

        # The text is recorded before the QID check, so an entity rejected
        # below as a duplicate QID still blocks later identical mentions.
        seen_entities.add(ent.text)

        if qid in seen_qids:
            continue
        seen_qids.add(qid)
        ents.append(ent)
    return ents
|
|
|
|
|
if __name__ == "__main__":
    # Prompt for one article, then list each linked entity with its label and
    # Wikidata URL, separated by " <tab> ".
    linked = extract_entities(input("article: "))
    print()
    print("ENTITIES:")
    for entity in linked:
        print(f"{entity.text} \t {entity.label_} \t {entity._.url_wikidata}")
|
|