ghomasHudson committed
Commit 8014fee
1 Parent(s): a50857e

Added cli standalone

Files changed (3):
  1. README.md +5 -0
  2. app.py +8 -35
  3. entity_extraction.py +43 -0
README.md CHANGED
@@ -10,3 +10,8 @@ pinned: false
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+
+## Quickstart
+
+For a simple GUI, run `streamlit run app.py`. For CLI usage, run `entity_extraction.py`.
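
The Quickstart points at two entry points: the Streamlit GUI and the standalone extractor (invoked as `python entity_extraction.py`). Because `app.py` now imports the extractor as a module, it can also be used programmatically. A minimal sketch, assuming the `en_core_web_md` model and the `entityfishing` pipeline component are installed and the entity-fishing service is reachable:

```python
# Programmatic use of the new standalone module (illustrative sketch).
from entity_extraction import extract_entities

# example.txt is the sample article that app.py loads into its text area.
article = open("example.txt").read()

for ent in extract_entities(article):
    # Each returned spaCy span carries the entityfishing extensions
    # used elsewhere in this commit (kb_qid, url_wikidata, nerd_score).
    print(ent.text, ent.label_, ent._.kb_qid, ent._.url_wikidata)
```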
app.py CHANGED
@@ -1,49 +1,23 @@
 import streamlit as st
 import streamlit.components.v1 as components
 import requests
-import spacy
 import hashlib
-
-nlp = spacy.load("en_core_web_md")
-
-# add pipeline (declared through entry_points in setup.py)
-nlp.add_pipe("entityfishing")
-
+from entity_extraction import extract_entities
 
 st.title('Entity Linking Demo')
+st.markdown("""Linking named entities in an article to
+wikidata entries (allowing us to pull the images).
 
+*Note: Only trained on entities before May 2020*""")
 
 article = st.text_area('Article to analyze:', value=open("example.txt").read())
 
-seen_entities = []
-seen_surnames = []
-seen_qids = []
 if st.button('Submit'):
-    good_ents = []
-
-    with st.spinner(text="Analysing..."):
-        doc = nlp(article)
-        for ent in doc.ents:
-            if ent._.kb_qid is None or ent.label_ not in ["ORG", "PERSON", "GPE"] or ent.text in seen_entities:
-                continue
-            if ent._.nerd_score < 0.5:
-                continue
-
-            if len(ent.text.split()) == 1:
-                # Single name
-                if ent.text in seen_surnames:
-                    continue
-            elif ent.label_ == "PERSON":
-                # Multipart name
-                seen_surnames.append(ent.text.split()[-1])
-
-            seen_entities.append(ent.text)
-            print((ent.text, ent.label_, ent._.kb_qid, ent._.url_wikidata, ent._.nerd_score))
-
-            if ent._.kb_qid in seen_qids:
-                continue
-            seen_qids.append(ent._.kb_qid)
+    with st.spinner(text="Extracting..."):
+        good_ents = []
 
+        ents = extract_entities(article)
+        for i, ent in enumerate(ents):
             r = requests.get("https://www.wikidata.org/w/api.php?action=wbgetclaims&format=json&property=P18&entity=" + ent._.kb_qid)
             data = r.json()["claims"]
             if "P18" in data.keys():
@@ -56,7 +30,6 @@ if st.button('Submit'):
                 good_ents.append((ent.text, ent.label_, ent._.kb_qid, ent._.url_wikidata, ent._.nerd_score, url))
     cols = st.columns(len(good_ents))
     for i, ent in enumerate(good_ents):
-        # st.image(url)
         with cols[i]:
             components.html(f"<image style='border-radius: 50%;object-fit:cover;width:100px;height:100px' src='{ent[-1]}'/>", height=110, width=110)
             st.caption(ent[0])
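
The second hunk elides the lines that turn the P18 claim into the `url` used for the thumbnail, so the exact logic is not shown here; the `hashlib` import suggests the usual Wikimedia Commons upload-path scheme. A sketch of that kind of lookup, offered as an assumption rather than the code actually committed:

```python
import hashlib
import requests


def commons_image_url(qid):
    """Illustrative helper: resolve a Wikidata item's P18 claim to a Commons file URL.

    Mirrors the standard MD5-based upload path; not necessarily the elided code.
    """
    r = requests.get(
        "https://www.wikidata.org/w/api.php",
        params={"action": "wbgetclaims", "format": "json", "property": "P18", "entity": qid},
    )
    claims = r.json().get("claims", {})
    if "P18" not in claims:
        return None
    # P18 values are Commons file names; spaces become underscores in URLs.
    filename = claims["P18"][0]["mainsnak"]["datavalue"]["value"].replace(" ", "_")
    digest = hashlib.md5(filename.encode("utf-8")).hexdigest()
    return f"https://upload.wikimedia.org/wikipedia/commons/{digest[0]}/{digest[:2]}/{filename}"
```

If the Space requests a resized thumbnail instead of the full file, the standard `.../commons/thumb/.../<width>px-<name>` form would apply, but that detail is not visible in this diff.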
entity_extraction.py ADDED
@@ -0,0 +1,43 @@
+import spacy
+
+nlp = spacy.load("en_core_web_md")
+nlp.add_pipe("entityfishing")
+
+
+def extract_entities(article):
+    '''Find wikidata refs for article entities'''
+    ents = []
+    seen_entities = []
+    seen_surnames = []
+    seen_qids = []
+
+    doc = nlp(article)
+    for ent in doc.ents:
+        if ent._.kb_qid is None or ent.label_ not in ["ORG", "PERSON", "GPE"] or ent.text in seen_entities:
+            continue
+        if ent._.nerd_score < 0.5:
+            continue
+
+        if len(ent.text.split()) == 1:
+            # Single name
+            if ent.text in seen_surnames:
+                continue
+        elif ent.label_ == "PERSON":
+            # Multipart name
+            seen_surnames.append(ent.text.split()[-1])
+
+        seen_entities.append(ent.text)
+
+        if ent._.kb_qid in seen_qids:
+            continue
+        seen_qids.append(ent._.kb_qid)
+        ents.append(ent)
+    return ents
+
+
+if __name__ == "__main__":
+    ents = extract_entities(input("article: "))
+    print()
+    print("ENTITIES:")
+    for ent in ents:
+        print(ent.text, "\t", ent.label_, "\t", ent._.url_wikidata)
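
One caveat on the `__main__` block: `input()` reads a single line, so a multi-paragraph article pasted at the prompt is cut off at the first newline. A stdin-based wrapper is one way around this; the file name `extract_cli.py` below is hypothetical and not part of this commit:

```python
# extract_cli.py (hypothetical): read the whole article from stdin instead of input(),
# e.g.  python extract_cli.py < example.txt
import sys

from entity_extraction import extract_entities

if __name__ == "__main__":
    article = sys.stdin.read()
    for ent in extract_entities(article):
        print(ent.text, "\t", ent.label_, "\t", ent._.url_wikidata)
```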