File size: 1,110 Bytes
8014fee
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import spacy

# Load the medium English model and append the "entityfishing" component,
# which links recognized entities to Wikidata and exposes the extension
# attributes read below: ent._.kb_qid, ent._.nerd_score, ent._.url_wikidata.
# NOTE(review): "entityfishing" is provided by the third-party spacyfishing
# package and en_core_web_md must be downloaded separately — confirm both
# are installed in the deployment environment.
nlp = spacy.load("en_core_web_md")
nlp.add_pipe("entityfishing")


def extract_entities(article):
    '''Find wikidata refs for article entities.

    Runs the module-level spaCy pipeline over ``article`` and returns the
    entity spans that:
      * have a Wikidata link (``ent._.kb_qid``) with linking confidence
        ``ent._.nerd_score >= 0.5``,
      * are labelled ORG, PERSON, or GPE,
      * are not duplicates by surface text, by previously-seen surname
        (so "Obama" is dropped after "Barack Obama"), or by QID.

    Parameters:
        article: raw article text (str).

    Returns:
        list of spaCy ``Span`` objects carrying the entity-fishing
        extension attributes (``._.kb_qid``, ``._.url_wikidata``).
    '''
    # Labels we keep; set membership is O(1) and built once, not per entity.
    wanted_labels = {"ORG", "PERSON", "GPE"}

    ents = []
    # Sets instead of lists: these exist only for membership tests, which
    # were O(n) per lookup in the original list-based version.
    seen_entities = set()
    seen_surnames = set()
    seen_qids = set()

    doc = nlp(article)
    for ent in doc.ents:
        # Skip unlinked, irrelevant, or already-seen surface forms.
        if (ent._.kb_qid is None
                or ent.label_ not in wanted_labels
                or ent.text in seen_entities):
            continue
        # Drop low-confidence Wikidata links.
        if ent._.nerd_score < 0.5:
            continue

        if len(ent.text.split()) == 1:
            # Single token: treat as a duplicate if it matches a surname
            # from an earlier multipart PERSON mention.
            if ent.text in seen_surnames:
                continue
        elif ent.label_ == "PERSON":
            # Multipart person name: remember the surname for later mentions.
            seen_surnames.add(ent.text.split()[-1])

        seen_entities.add(ent.text)

        # Deduplicate by QID as well (different surface forms resolving to
        # the same Wikidata entity). NOTE: the surface form is recorded
        # above even when the QID turns out to be a duplicate — this
        # preserves the original ordering/behavior exactly.
        if ent._.kb_qid in seen_qids:
            continue
        seen_qids.add(ent._.kb_qid)
        ents.append(ent)
    return ents


if __name__ == "__main__":
    # Read one article from stdin, link its entities, and print a summary
    # line (text, label, Wikidata URL) per entity.
    found = extract_entities(input("article: "))
    print()
    print("ENTITIES:")
    for entity in found:
        print(entity.text, "\t", entity.label_, "\t", entity._.url_wikidata)