ghomasHudson committed
Commit 8014fee
1 Parent(s): a50857e

Added cli standalone

Files changed (3):
  1. README.md +5 -0
  2. app.py +8 -35
  3. entity_extraction.py +43 -0
README.md CHANGED
@@ -10,3 +10,8 @@ pinned: false
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+
+## Quickstart
+
+For a simple GUI, run `streamlit run app.py`. For CLI usage, run `entity_extraction.py`.
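
The Quickstart points at two entry points: the Streamlit GUI and the standalone extractor (invoked as `python entity_extraction.py`). Because `app.py` now imports the extractor as a module, it can also be used programmatically. A minimal sketch, assuming the `en_core_web_md` model and the `entityfishing` pipeline component are installed and the entity-fishing service is reachable:

```python
# Programmatic use of the new standalone module (illustrative sketch).
from entity_extraction import extract_entities

# example.txt is the sample article that app.py loads into its text area.
article = open("example.txt").read()

for ent in extract_entities(article):
    # Each returned spaCy span carries the entityfishing extensions
    # used elsewhere in this commit (kb_qid, url_wikidata, nerd_score).
    print(ent.text, ent.label_, ent._.kb_qid, ent._.url_wikidata)
```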
app.py CHANGED
@@ -1,49 +1,23 @@
 import streamlit as st
 import streamlit.components.v1 as components
 import requests
-import spacy
 import hashlib
-
-nlp = spacy.load("en_core_web_md")
-
-# add pipeline (declared through entry_points in setup.py)
-nlp.add_pipe("entityfishing")
-
+from entity_extraction import extract_entities
 
 st.title('Entity Linking Demo')
+st.markdown("""Linking named entities in an article to
+wikidata entries (allowing us to pull the images).
 
+*Note: Only trained on entities before May 2020*""")
 
 article = st.text_area('Article to analyze:', value=open("example.txt").read())
 
-seen_entities = []
-seen_surnames = []
-seen_qids = []
 if st.button('Submit'):
-    good_ents = []
-
-    with st.spinner(text="Analysing..."):
-        doc = nlp(article)
-        for ent in doc.ents:
-            if ent._.kb_qid is None or ent.label_ not in ["ORG", "PERSON", "GPE"] or ent.text in seen_entities:
-                continue
-            if ent._.nerd_score < 0.5:
-                continue
-
-            if len(ent.text.split()) == 1:
-                # Single name
-                if ent.text in seen_surnames:
-                    continue
-            elif ent.label_ == "PERSON":
-                # Multipart name
-                seen_surnames.append(ent.text.split()[-1])
-
-            seen_entities.append(ent.text)
-            print((ent.text, ent.label_, ent._.kb_qid, ent._.url_wikidata, ent._.nerd_score))
-
-            if ent._.kb_qid in seen_qids:
-                continue
-            seen_qids.append(ent._.kb_qid)
+    with st.spinner(text="Extracting..."):
+        good_ents = []
 
+        ents = extract_entities(article)
+        for i, ent in enumerate(ents):
             r = requests.get("https://www.wikidata.org/w/api.php?action=wbgetclaims&format=json&property=P18&entity=" + ent._.kb_qid)
             data = r.json()["claims"]
             if "P18" in data.keys():
@@ -56,7 +30,6 @@ if st.button('Submit'):
                 good_ents.append((ent.text, ent.label_, ent._.kb_qid, ent._.url_wikidata, ent._.nerd_score, url))
     cols = st.columns(len(good_ents))
     for i, ent in enumerate(good_ents):
-        # st.image(url)
         with cols[i]:
             components.html(f"<image style='border-radius: 50%;object-fit:cover;width:100px;height:100px' src='{ent[-1]}'/>", height=110, width=110)
             st.caption(ent[0])
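
The second hunk elides the lines that turn the P18 claim into the `url` used for the thumbnail, so the exact logic is not shown here; the `hashlib` import suggests the usual Wikimedia Commons upload-path scheme. A sketch of that kind of lookup, offered as an assumption rather than the code actually committed:

```python
import hashlib
import requests


def commons_image_url(qid):
    """Illustrative helper: resolve a Wikidata item's P18 claim to a Commons file URL.

    Mirrors the standard MD5-based upload path; not necessarily the elided code.
    """
    r = requests.get(
        "https://www.wikidata.org/w/api.php",
        params={"action": "wbgetclaims", "format": "json", "property": "P18", "entity": qid},
    )
    claims = r.json().get("claims", {})
    if "P18" not in claims:
        return None
    # P18 values are Commons file names; spaces become underscores in URLs.
    filename = claims["P18"][0]["mainsnak"]["datavalue"]["value"].replace(" ", "_")
    digest = hashlib.md5(filename.encode("utf-8")).hexdigest()
    return f"https://upload.wikimedia.org/wikipedia/commons/{digest[0]}/{digest[:2]}/{filename}"
```

If the Space requests a resized thumbnail instead of the full file, the standard `.../commons/thumb/.../<width>px-<name>` form would apply, but that detail is not visible in this diff.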
entity_extraction.py ADDED
@@ -0,0 +1,43 @@
+import spacy
+
+nlp = spacy.load("en_core_web_md")
+nlp.add_pipe("entityfishing")
+
+
+def extract_entities(article):
+    '''Find wikidata refs for article entities'''
+    ents = []
+    seen_entities = []
+    seen_surnames = []
+    seen_qids = []
+
+    doc = nlp(article)
+    for ent in doc.ents:
+        if ent._.kb_qid is None or ent.label_ not in ["ORG", "PERSON", "GPE"] or ent.text in seen_entities:
+            continue
+        if ent._.nerd_score < 0.5:
+            continue
+
+        if len(ent.text.split()) == 1:
+            # Single name
+            if ent.text in seen_surnames:
+                continue
+        elif ent.label_ == "PERSON":
+            # Multipart name
+            seen_surnames.append(ent.text.split()[-1])
+
+        seen_entities.append(ent.text)
+
+        if ent._.kb_qid in seen_qids:
+            continue
+        seen_qids.append(ent._.kb_qid)
+        ents.append(ent)
+    return ents
+
+
+if __name__ == "__main__":
+    ents = extract_entities(input("article: "))
+    print()
+    print("ENTITIES:")
+    for ent in ents:
+        print(ent.text, "\t", ent.label_, "\t", ent._.url_wikidata)
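
One caveat on the `__main__` block: `input()` reads a single line, so a multi-paragraph article pasted at the prompt is cut off at the first newline. A stdin-based wrapper is one way around this; the file name `extract_cli.py` below is hypothetical and not part of this commit:

```python
# extract_cli.py (hypothetical): read the whole article from stdin instead of input(),
# e.g.  python extract_cli.py < example.txt
import sys

from entity_extraction import extract_entities

if __name__ == "__main__":
    article = sys.stdin.read()
    for ent in extract_entities(article):
        print(ent.text, "\t", ent.label_, "\t", ent._.url_wikidata)
```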