import spacy

# Load the medium English model and attach the entity-fishing pipe, which
# adds Wikidata-linking span attributes (kb_qid, nerd_score, url_wikidata).
nlp = spacy.load("en_core_web_md")
nlp.add_pipe("entityfishing")


def remove_plural_names(article):
    """Strip a trailing possessive marker ("'s" / "’s") from every word.

    NOTE(review): despite the name, this removes *possessives*, not plurals
    ("Biden's" -> "Biden"); the name is kept for backward compatibility.
    """
    new_words = []
    for word in article.split():
        # Strip only a trailing marker; a blanket replace() would also
        # delete an "'s" occurring in the middle of a token.
        if word.endswith(("'s", "’s")):
            word = word[:-2]
        new_words.append(word)
    return " ".join(new_words)


def extract_entities(article):
    """Find wikidata refs for article entities.

    Returns the spaCy entity spans (ORG / PERSON / GPE only) that were
    linked to Wikidata with nerd_score >= 0.5, de-duplicated by surface
    text, by surname (for lone single-word names), and by Wikidata QID.
    """
    ents = []
    seen_entities = set()  # full surface strings already accepted
    seen_surnames = set()  # last tokens of multi-word PERSON names
    seen_qids = set()      # Wikidata QIDs already accepted

    article = remove_plural_names(article)
    doc = nlp(article)
    for ent in doc.ents:
        # Keep only linked entities of the wanted types, not already seen.
        if (ent._.kb_qid is None
                or ent.label_ not in ("ORG", "PERSON", "GPE")
                or ent.text in seen_entities):
            continue
        # Drop low-confidence disambiguations.
        if ent._.nerd_score < 0.5:
            continue

        tokens = ent.text.split()
        if len(tokens) == 1:
            # A lone name matching a surname we already saw is assumed to
            # refer to the same person mentioned earlier in full.
            if ent.text in seen_surnames:
                continue
        elif ent.label_ == "PERSON":
            # Remember the surname so later lone mentions are skipped.
            seen_surnames.add(tokens[-1])

        seen_entities.add(ent.text)
        # The same QID can surface under different spellings; keep one.
        if ent._.kb_qid in seen_qids:
            continue
        seen_qids.add(ent._.kb_qid)
        ents.append(ent)
    return ents


if __name__ == "__main__":
    ents = extract_entities(input("article: "))
    print()
    print("ENTITIES:")
    for ent in ents:
        print(ent.text, "\t", ent.label_, "\t", ent._.url_wikidata)