lingbionlp's picture
Update app.py
7659c77
raw
history blame
7.98 kB
# -*- coding: utf-8 -*-
"""
Created on Mon Nov 21 16:21:25 2022
@author: luol2
"""
import streamlit as st
from src.nn_model import bioTag_CNN,bioTag_Bioformer
from src.dic_ner import dic_ont
from src.tagging_text import bioTag
import os
import json
from pandas import DataFrame
import nltk
# Download the NLTK resources this app requests at start-up.
for _nltk_resource in ('punkt', 'averaged_perceptron_tagger', 'wordnet'):
    nltk.download(_nltk_resource)
# Page-level Streamlit settings: page title and icon, wide layout, and the
# custom "Get Help" / "About" menu entries.
_MENU_ITEMS = {
    'Get Help': 'https://www.ncbi.nlm.nih.gov/research/bionlp/',
    'About': "PhenoTagger v1.1",
}

st.set_page_config(
    page_title="PhenoTagger",
    page_icon="🎈",
    layout="wide",
    menu_items=_MENU_ITEMS,
)
# def _max_width_():
# max_width_str = f"max-width: 2400px;"
# st.markdown(
# f"""
# <style>
# .reportview-container .main .block-container{{
# {max_width_str}
# }}
# </style>
# """,
# unsafe_allow_html=True,
# )
# _max_width_()
# c30, c31, c32 = st.columns([2.5, 1, 3])
# with c30:
# # st.image("logo.png", width=400)
# Markdown body of the "About this app" panel.  The bullet lines stay at
# column 0 so the text handed to Streamlit is exactly the original string.
_ABOUT_MARKDOWN = """
- This app is an easy-to-use interface built in Streamlit for [PhenoTagger](https://github.com/ncbi-nlp/PhenoTagger) library!
- PhenoTagger is a hybrid method that combines dictionary and deep learning-based methods to recognize Human Phenotype Ontology (HPO) concepts in unstructured biomedical text. Please refer to [our paper](https://doi.org/10.1093/bioinformatics/btab019) for more details.
- Contact: [NLM/NCBI BioNLP Research Group](https://www.ncbi.nlm.nih.gov/research/bionlp/)
"""

st.title("👨‍⚕️ PhenoTagger Demo")

# Expander is open by default so first-time visitors see the description.
about_panel = st.expander("ℹ️ - About this app", expanded=True)
with about_panel:
    st.write(_ABOUT_MARKDOWN)
    st.markdown("")

# Empty markdown element used as vertical spacing before the input section.
st.markdown("")
st.markdown("## 📌 Paste document ")
# Ontology dictionary files used by both model back-ends.
ONT_FILES = {
    'dic_file': './dict_new/noabb_lemma.dic',
    'word_hpo_file': './dict_new/word_id_map.json',
    'hpo_word_file': './dict_new/id_word_map.json',
}


# NOTE(review): st.cache is deprecated in newer Streamlit releases in favour
# of st.cache_resource; it is kept here so the app keeps running on the
# Streamlit version it was written for.
@st.cache(allow_output_mutation=True)
def load_bioformer_model():
    """Load the Bioformer tagger and the ontology dictionary.

    The result is cached by Streamlit, so the files are only read the first
    time this function runs.

    Returns:
        A ``(nn_model, biotag_dic)`` tuple: the ``bioTag_Bioformer`` instance
        with its weights loaded, and the ``dic_ont`` dictionary object.
    """
    vocabfiles = {
        'labelfile': './dict_new/lable.vocab',
        'config_path': './vocab/bioformer-cased-v1.0/bert_config.json',
        'checkpoint_path': './vocab/bioformer-cased-v1.0/bioformer-cased-v1.0-model.ckpt-2000000',
        'vocab_path': './vocab/bioformer-cased-v1.0/vocab.txt',
    }
    modelfile = './vocab/bioformer_p5n5_b64_1e-5_95_hponew3.h5'
    # Pass a copy so the shared path table can never be altered by the callee.
    biotag_dic = dic_ont(dict(ONT_FILES))
    nn_model = bioTag_Bioformer(vocabfiles)
    nn_model.load_model(modelfile)
    return nn_model, biotag_dic


@st.cache(allow_output_mutation=True)
def load_cnn_model():
    """Load the CNN tagger and the ontology dictionary.

    The result is cached by Streamlit, so the files are only read the first
    time this function runs.

    Returns:
        A ``(nn_model, biotag_dic)`` tuple: the ``bioTag_CNN`` instance with
        its weights loaded, and the ``dic_ont`` dictionary object.
    """
    vocabfiles = {
        'w2vfile': './vocab/bio_embedding_intrinsic.d200',
        'charfile': './vocab/char.vocab',
        'labelfile': './dict_new/lable.vocab',
        'posfile': './vocab/pos.vocab',
    }
    modelfile = './models/cnn_p5n5_b128_95_hponew1.h5'
    # Pass a copy so the shared path table can never be altered by the callee.
    biotag_dic = dic_ont(dict(ONT_FILES))
    nn_model = bioTag_CNN(vocabfiles)
    nn_model.load_model(modelfile)
    return nn_model, biotag_dic


with st.form(key="my_form"):
    # Narrow unnamed columns act as gutters around the two real columns:
    # c1 holds the options, c2 holds the text box and the submit button.
    _, c1, _, c2, _ = st.columns([0.07, 1, 0.07, 4, 0.07])

    with c1:
        ModelType = st.radio(
            "Choose your model",
            ["Bioformer(Default)", "CNN"],
            help="Bioformer is more precise, CNN is more efficient",
        )
        if ModelType == "Bioformer(Default)":
            nn_model, biotag_dic = load_bioformer_model()
        else:
            nn_model, biotag_dic = load_cnn_model()

        para_overlap = st.checkbox(
            "Overlap concept",
            value=True,
            help="Tick this box to identify overlapping concepts",
        )
        para_abbr = st.checkbox(
            "Abbreviations",
            value=True,
            help="Tick this box to identify abbreviations",
        )
        para_threshold = st.slider(
            "Threshold",
            min_value=0.5,
            max_value=1.0,
            value=0.95,
            step=0.05,
            help="Return the predictions which score over the threshold.",
        )

    with c2:
        # No word limit is applied to the pasted text.
        doc = st.text_area(
            "Paste your text below",
            height=400,
        )
        submit_button = st.form_submit_button(label="✨ Submit!")

# Nothing below this point runs until the form has been submitted.
if not submit_button:
    st.stop()
# Options forwarded to bioTag(), collected from the form widgets.
# NOTE(review): the "Overlap concept" checkbox is described as *enabling*
# overlapping concepts, yet its value is passed straight through as
# onlyLongest, which is documented below as "True: only the longest".
# Confirm against bioTag() whether this value should be negated.
para_set = dict(
    onlyLongest=para_overlap,     # False: return overlapping concepts, True: only the longest
    abbrRecog=para_abbr,          # False: don't identify abbreviations, True: identify them
    ML_Threshold=para_threshold,  # score threshold of the deep learning model
)

st.markdown("")
st.markdown("## 💡 Tagging results:")

# Show a spinner while the tagger is running.
with st.spinner('Wait for tagging...'):
    tag_result = bioTag(
        doc,
        biotag_dic,
        nn_model,
        onlyLongest=para_set['onlyLongest'],
        abbrRecog=para_set['abbrRecog'],
        Threshold=para_set['ML_Threshold'],
    )

hover_hint = '<font style="color: rgb(128, 128, 128);">Move the mouse over the entity to display the HPO id.</font>'
st.markdown(hover_hint, unsafe_allow_html=True)
# print('dic...........:',biotag_dic.keys())
# st.write('parameters:', para_overlap,para_abbr,para_threshold)
def _render_annotations(doc, tag_result):
    """Build the highlighted HTML, the downloadable text and concept counts.

    Args:
        doc: The text that was tagged.
        tag_result: Iterable of annotations.  Each item is indexed as
            ``[0]`` start offset, ``[1]`` end offset, ``[2]`` HPO id and
            ``[3]`` score; offsets and id are strings (they are joined into
            the tab-separated output).  Assumed to be ordered by start
            offset -- TODO confirm against bioTag().

    Returns:
        A ``(html_results, text_results, hpoid_count)`` tuple:
        ``html_results`` is ``doc`` with every entity wrapped in a
        highlighted ``<font>`` tag whose ``title`` is the HPO id,
        ``text_results`` is ``doc`` followed by one tab-separated line per
        annotation, and ``hpoid_count`` maps each HPO id to its number of
        occurrences.
    """
    import html  # local import keeps this block self-contained

    html_parts = []
    text_lines = [doc]
    hpoid_count = {}
    cursor = 0  # end of the document text already copied into html_parts

    for ele in tag_result:
        entity_start = int(ele[0])
        entity_end = int(ele[1])
        entity_id = ele[2]

        # The download file keeps the raw, unescaped text.
        text_lines.append('\t'.join((
            ele[0],
            ele[1],
            doc[entity_start:entity_end],
            entity_id,
            format(float(ele[3]), '.2f'),
        )))
        hpoid_count[entity_id] = hpoid_count.get(entity_id, 0) + 1

        # The document is user input rendered with unsafe_allow_html, so
        # every piece of it is escaped before being placed in the markup.
        if entity_start > cursor:
            html_parts.append(html.escape(doc[cursor:entity_start]))

        # For overlapping or nested spans only the part not yet emitted is
        # highlighted, so document text is never duplicated in the output.
        visible_start = max(entity_start, cursor)
        if entity_end > visible_start:
            html_parts.append(
                '<font style="background-color: rgb(255, 204, 0);" title="'
                + html.escape(entity_id, quote=True)
                + '">'
                + html.escape(doc[visible_start:entity_end])
                + '</font>'
            )
            cursor = entity_end

    # Remaining text after the last entity (the whole document if none).
    html_parts.append(html.escape(doc[cursor:]))
    return ''.join(html_parts), '\n'.join(text_lines) + '\n', hpoid_count


html_results, text_results, hpoid_count = _render_annotations(doc, tag_result)
st.markdown('<table border="1"><tr><td>' + html_results + '</td></tr></table>', unsafe_allow_html=True)
# Summary table: one row per distinct HPO id -> [HPO id, term name, count].
# The term name is the first entry stored for the id in biotag_dic.hpo_word.
data_entity = [
    [hpo_id, biotag_dic.hpo_word[hpo_id][0], occurrences]
    for hpo_id, occurrences in hpoid_count.items()
]

st.markdown("")
st.markdown("")

# Most frequent concepts first, then renumber the rows.
frequency_table = DataFrame(data_entity, columns=["HPO_id", "Term name", "Frequency"])
frequency_table = frequency_table.sort_values(by="Frequency", ascending=False)
df = frequency_table.reset_index(drop=True)
df.index += 1  # show 1-based row numbers instead of 0-based

# Centre the table in the wide middle column.
_, table_col, _ = st.columns([1, 4, 1])
with table_col:
    st.table(df)

# Centre the download button in the middle of three equal columns.
_, button_col, _ = st.columns([1, 1, 1])
with button_col:
    st.download_button('Download annotations', text_results)