import spacy

# Load the medium English model and attach the entity-fishing pipe, which
# adds Wikidata-linking span attributes (kb_qid, nerd_score, url_wikidata).
nlp = spacy.load("en_core_web_md")
nlp.add_pipe("entityfishing")


def remove_plural_names(article):
    """Strip a trailing possessive marker ("'s" / "’s") from every word.

    NOTE(review): despite the name, this removes *possessives*, not plurals
    ("Biden's" -> "Biden"); the name is kept for backward compatibility.
    """
    new_words = []
    for word in article.split():
        # Strip only a trailing marker; a blanket replace() would also
        # delete an "'s" occurring in the middle of a token.
        if word.endswith(("'s", "’s")):
            word = word[:-2]
        new_words.append(word)
    return " ".join(new_words)


def extract_entities(article):
    """Find wikidata refs for article entities.

    Returns the spaCy entity spans (ORG / PERSON / GPE only) that were
    linked to Wikidata with nerd_score >= 0.5, de-duplicated by surface
    text, by surname (for lone single-word names), and by Wikidata QID.
    """
    ents = []
    seen_entities = set()  # full surface strings already accepted
    seen_surnames = set()  # last tokens of multi-word PERSON names
    seen_qids = set()      # Wikidata QIDs already accepted

    article = remove_plural_names(article)
    doc = nlp(article)
    for ent in doc.ents:
        # Keep only linked entities of the wanted types, not already seen.
        if (ent._.kb_qid is None
                or ent.label_ not in ("ORG", "PERSON", "GPE")
                or ent.text in seen_entities):
            continue
        # Drop low-confidence disambiguations.
        if ent._.nerd_score < 0.5:
            continue

        tokens = ent.text.split()
        if len(tokens) == 1:
            # A lone name matching a surname we already saw is assumed to
            # refer to the same person mentioned earlier in full.
            if ent.text in seen_surnames:
                continue
        elif ent.label_ == "PERSON":
            # Remember the surname so later lone mentions are skipped.
            seen_surnames.add(tokens[-1])

        seen_entities.add(ent.text)
        # The same QID can surface under different spellings; keep one.
        if ent._.kb_qid in seen_qids:
            continue
        seen_qids.add(ent._.kb_qid)
        ents.append(ent)
    return ents


if __name__ == "__main__":
    ents = extract_entities(input("article: "))
    print()
    print("ENTITIES:")
    for ent in ents:
        print(ent.text, "\t", ent.label_, "\t", ent._.url_wikidata)