# -*- coding: utf-8 -*-
"""
Created on Mon Nov 21 16:21:25 2022

@author: luol2
"""

import streamlit as st
from src.nn_model import bioTag_CNN, bioTag_Bioformer
from src.dic_ner import dic_ont
from src.tagging_text import bioTag
import os
import json
from pandas import DataFrame
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

st.set_page_config(
    page_title="PhenoTagger",
    page_icon="🎈",
    layout="wide",
    menu_items={
        'Get Help': 'https://www.ncbi.nlm.nih.gov/research/bionlp/',
        'About': "PhenoTagger v1.1",
    },
)

# def _max_width_():
#     max_width_str = f"max-width: 2400px;"
#     st.markdown(
#         f"""
#         <style>
#         .reportview-container .main .block-container{{
#             {max_width_str}
#         }}
#         </style>
#         """,
#         unsafe_allow_html=True,
#     )
# _max_width_()

# c30, c31, c32 = st.columns([2.5, 1, 3])
# with c30:
#     # st.image("logo.png", width=400)

st.title("👨‍⚕ī¸ PhenoTagger Demo")

with st.expander("ℹī¸ - About this app", expanded=True):
    st.write(
        """
- This app is an easy-to-use interface built in Streamlit for the [PhenoTagger](https://github.com/ncbi-nlp/PhenoTagger) library.
- PhenoTagger is a hybrid method that combines dictionary- and deep learning-based methods to recognize Human Phenotype Ontology (HPO) concepts in unstructured biomedical text. Please refer to [our paper](https://doi.org/10.1093/bioinformatics/btab019) for more details.
- Contact: [NLM/NCBI BioNLP Research Group](https://www.ncbi.nlm.nih.gov/research/bionlp/)
"""
    )

st.markdown("")
st.markdown("")
st.markdown("## 📌 Paste document")

with st.form(key="my_form"):
    ce, c1, ce2, c2, c3 = st.columns([0.07, 1, 0.07, 4, 0.07])  # ce/ce2/c3 are narrow spacer columns
    with c1:
        ModelType = st.radio(
            "Choose your model",
            ["Bioformer(Default)", "CNN"],
            help="Bioformer is more precise, CNN is more efficient",
        )

        if ModelType == "Bioformer(Default)":
            # kw_model = KeyBERT(model=roberta)
            @st.cache(allow_output_mutation=True)
            def load_model():
                ontfiles = {'dic_file': './dict_new/noabb_lemma.dic',
                            'word_hpo_file': './dict_new/word_id_map.json',
                            'hpo_word_file': './dict_new/id_word_map.json'}

                vocabfiles = {'labelfile': './dict_new/lable.vocab',
                              'config_path': './vocab/bioformer-cased-v1.0/bert_config.json',
                              'checkpoint_path': './vocab/bioformer-cased-v1.0/bioformer-cased-v1.0-model.ckpt-2000000',
                              'vocab_path': './vocab/bioformer-cased-v1.0/vocab.txt'}
                modelfile = './vocab/bioformer_p5n5_b64_1e-5_95_hponew3.h5'

                biotag_dic = dic_ont(ontfiles)

                nn_model = bioTag_Bioformer(vocabfiles)
                nn_model.load_model(modelfile)
                return nn_model, biotag_dic

            nn_model, biotag_dic = load_model()

        else:
            @st.cache(allow_output_mutation=True)
            def load_model():
                ontfiles = {'dic_file': './dict_new/noabb_lemma.dic',
                            'word_hpo_file': './dict_new/word_id_map.json',
                            'hpo_word_file': './dict_new/id_word_map.json'}

                vocabfiles = {'w2vfile': './vocab/bio_embedding_intrinsic.d200',
                              'charfile': './vocab/char.vocab',
                              'labelfile': './dict_new/lable.vocab',
                              'posfile': './vocab/pos.vocab'}
                modelfile = './models/cnn_p5n5_b128_95_hponew1.h5'

                biotag_dic = dic_ont(ontfiles)

                nn_model = bioTag_CNN(vocabfiles)
                nn_model.load_model(modelfile)
                return nn_model, biotag_dic

            nn_model, biotag_dic = load_model()

        para_overlap = st.checkbox(
            "Overlapping concepts",
            value=True,
            help="Tick this box to identify overlapping concepts",
        )
        para_abbr = st.checkbox(
            "Abbreviations",
            value=True,
            help="Tick this box to identify abbreviations",
        )

        para_threshold = st.slider(
            "Threshold",
            min_value=0.5,
            max_value=1.0,
            value=0.95,
            step=0.05,
            help="Return the predictions whose score is over the threshold.",
        )

    with c2:
        doc = st.text_area(
            "Paste your text below",
            height=400,
        )

        # MAX_WORDS = 500
        # import re
        # res = len(re.findall(r"\w+", doc))
        # if res > MAX_WORDS:
        #     st.warning(
        #         "⚠ī¸ Your text contains "
        #         + str(res)
        #         + " words."
        #         + " Only the first 500 words will be reviewed. Stay tuned as increased allowance is coming! 😊"
        #     )
        #     doc = doc[:MAX_WORDS]
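        # Hedged sketch (not wired in): if the word-limit guard above is re-enabled,
        # note that doc[:MAX_WORDS] keeps the first 500 characters, not the first
        # 500 words. A word-aware truncation, assuming plain whitespace tokenization
        # is acceptable here, would be:
        # doc = ' '.join(doc.split()[:MAX_WORDS])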
    submit_button = st.form_submit_button(label="✨ Submit!")

if not submit_button:
    st.stop()

para_set = {
    # 'model_type': para_model,        # cnn or bioformer
    'onlyLongest': not para_overlap,   # False: return overlapping concepts; True: only the longest (the checkbox selects overlaps, so it is inverted here)
    'abbrRecog': para_abbr,            # False: don't identify abbreviations; True: identify abbreviations
    'ML_Threshold': para_threshold,    # the threshold of the deep learning model
}

st.markdown("")
st.markdown("## 💡 Tagging results:")
with st.spinner('Wait for tagging...'):
    tag_result = bioTag(doc, biotag_dic, nn_model,
                        onlyLongest=para_set['onlyLongest'],
                        abbrRecog=para_set['abbrRecog'],
                        Threshold=para_set['ML_Threshold'])

st.markdown('Move the mouse over the entity to display the HPO id.', unsafe_allow_html=True)
# print('dic...........:', biotag_dic.keys())
# st.write('parameters:', para_overlap, para_abbr, para_threshold)

# each element of tag_result is [start, end, HPO id, score], with the offsets and
# score encoded as strings, e.g. ['5', '16', 'HP:0001250', '0.98']
html_results = ''
text_results = doc + '\n'
entity_end = 0
hpoid_count = {}
if len(tag_result) > 0:
    for ele in tag_result:
        entity_start = int(ele[0])
        html_results += doc[entity_end:entity_start]
        entity_end = int(ele[1])
        entity_id = ele[2]
        entity_score = ele[3]
        text_results += ele[0] + '\t' + ele[1] + '\t' + doc[entity_start:entity_end] + '\t' + ele[2] + '\t' + format(float(ele[3]), '.2f') + '\n'
        if entity_id not in hpoid_count:
            hpoid_count[entity_id] = 1
        else:
            hpoid_count[entity_id] += 1
        # highlight the entity; hovering shows the HPO id via the title attribute
        # (the highlight color is illustrative)
        html_results += '<span style="background-color: #ffe680;" title="' + entity_id + '">' + doc[entity_start:entity_end] + '</span>'
    html_results += doc[entity_end:]
else:
    html_results = doc

# render the highlighted text (the wrapper div styling is illustrative)
st.markdown('<div style="border: 1px solid #e6e6e6; border-radius: 5px; padding: 10px;">'
            + html_results + '</div>', unsafe_allow_html=True)
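# Hedged sketch (not called above): the pasted text is interpolated into raw HTML
# and rendered with unsafe_allow_html=True, so characters such as '<' or '&' in
# the input can break the markup. The standard library's html.escape() could be
# applied to each plain-text slice before concatenation, e.g.:
# import html
# html_results += html.escape(doc[entity_end:entity_start])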
# table
data_entity = []
for ele in hpoid_count:
    temp = [ele, biotag_dic.hpo_word[ele][0], hpoid_count[ele]]  # HPO id, term name, count
    data_entity.append(temp)

st.markdown("")
st.markdown("")
# st.markdown("## Table output:")

# cs, c1, c2, c3, cLast = st.columns([2, 1.5, 1.5, 1.5, 2])
# with c1:
#     CSVButton2 = download_button(keywords, "Data.csv", "đŸ“Ĩ Download (.csv)")
# with c2:
#     CSVButton2 = download_button(keywords, "Data.txt", "đŸ“Ĩ Download (.txt)")
# with c3:
#     CSVButton2 = download_button(keywords, "Data.json", "đŸ“Ĩ Download (.json)")
# st.header("")

df = (
    DataFrame(data_entity, columns=["HPO_id", "Term name", "Frequency"])
    .sort_values(by="Frequency", ascending=False)
    .reset_index(drop=True)
)
df.index += 1
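# Hedged sketch (not wired in): the commented buttons above rely on a custom
# download_button helper (and a keywords variable) that are not defined in this
# file. A plain-Streamlit equivalent for the entity table would be:
# st.download_button('đŸ“Ĩ Download (.csv)', df.to_csv(index_label='Rank'), file_name='Data.csv')
# st.download_button('đŸ“Ĩ Download (.json)', df.to_json(orient='records'), file_name='Data.json')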
c1, c2, c3 = st.columns([1, 4, 1])

# format_dictionary = {
#     "Relevancy": "{:.1%}",
# }
# df = df.format(format_dictionary)

with c2:
    st.table(df)

c1, c2, c3 = st.columns([1, 1, 1])
with c2:
    st.download_button('Download annotations', text_results)
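# To launch the demo locally (standard Streamlit entry point; the file name is
# an assumption):
#   streamlit run streamlit_app.py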