"""Extract Wikidata-linked named entities from article text.

Importing this module loads the spaCy ``en_core_web_md`` pipeline and adds
the ``entityfishing`` pipe to it, so the import itself is not cheap.
"""
import spacy

# Entity labels kept by default; entities with any other label are ignored.
DEFAULT_LABELS = frozenset({"ORG", "PERSON", "GPE"})
# Default minimum ``nerd_score`` an entity must reach to be kept.
DEFAULT_MIN_SCORE = 0.5

nlp = spacy.load("en_core_web_md")
nlp.add_pipe("entityfishing")


def extract_entities(article, *, labels=DEFAULT_LABELS, min_score=DEFAULT_MIN_SCORE):
    """Find wikidata refs for article entities.

    Runs ``article`` through the module-level ``nlp`` pipeline and returns the
    entities that survive the following filters, in document order:

    * the entity has a ``kb_qid`` and its label is in ``labels``;
    * its ``nerd_score`` is at least ``min_score``;
    * the same entity text has not been accepted before;
    * a single-word entity is not the last word of a multi-word ``PERSON``
      entity seen earlier (e.g. a bare surname after the full name);
    * its ``kb_qid`` has not been returned already.

    Args:
        article: Text to analyse.
        labels: Container of entity labels to keep. Defaults to
            ``ORG``, ``PERSON`` and ``GPE``.
        min_score: Minimum ``nerd_score`` required. Defaults to ``0.5``.

    Returns:
        A list of the entity spans that passed every filter, first
        occurrence only.
    """
    kept = []
    # Sets give O(1) membership tests; only membership is ever checked.
    seen_texts = set()
    seen_surnames = set()
    seen_qids = set()

    for ent in nlp(article).ents:
        qid = ent._.kb_qid
        text = ent.text
        if qid is None or ent.label_ not in labels or text in seen_texts:
            continue

        score = ent._.nerd_score
        # NOTE(review): a score of None would make ``<`` raise TypeError;
        # treat it as "below threshold". Confirm whether the linker can
        # actually leave the score unset on an entity that has a kb_qid.
        if score is None or score < min_score:
            continue

        words = text.split()
        if len(words) == 1:
            # Single name: skip it when it repeats a surname already seen.
            if text in seen_surnames:
                continue
        elif words and ent.label_ == "PERSON":
            # Multipart name: remember the last word as the surname.
            seen_surnames.add(words[-1])

        # The text is recorded even if the qid turns out to be a duplicate,
        # so later identical mentions are rejected by the cheap text check.
        seen_texts.add(text)
        if qid in seen_qids:
            continue
        seen_qids.add(qid)
        kept.append(ent)
    return kept


def main():
    """Prompt for an article and print each entity's text, label and Wikidata URL."""
    found = extract_entities(input("article: "))
    print()
    print("ENTITIES:")
    for ent in found:
        print(ent.text, "\t", ent.label_, "\t", ent._.url_wikidata)


if __name__ == "__main__":
    main()