Spaces:

lingbionlp
/

PhenoTagger-Demo

Build error

File size: 5,782 Bytes

273b73b

# -*- coding: utf-8 -*-
"""

Created on Thu Aug 13 09:20:38 2020



@author: luol2

"""


import argparse
from nn_model import bioTag_CNN,bioTag_BERT,bioTag_Bioformer
from dic_ner import dic_ont
from tagging_text import bioTag
import os
import time
import json
import sys



def file_tag_hybrid(infile,outfile,biotag_dic,nn_model,para_set):
    """Tag every document of a PubTator-style file and write the results.

    The input is read as blocks separated by blank lines. Inside a block, a
    line containing ``|t|`` supplies the document id and title, and a line
    containing ``|a|`` supplies the document id and abstract. Blocks that
    yield no document id are skipped. For every other block the title and
    abstract lines are echoed to ``outfile``, followed by one tab-separated
    line per mention returned by ``bioTag`` and a terminating blank line.

    Args:
        infile: path of the input text file (read as UTF-8).
        outfile: path of the output file (written as UTF-8, overwritten).
        biotag_dic: dictionary object handed unchanged to ``bioTag``.
        nn_model: model object handed unchanged to ``bioTag``.
        para_set: mapping providing the keys ``onlyLongest``, ``abbrRecog``
            and ``ML_Threshold``, forwarded to ``bioTag``.
    """
    # Read the whole input before the output is opened: the input handle is
    # always closed, a bad input path does not leave an empty output file
    # behind, and tagging a file in place does not truncate it before reading.
    with open(infile,'r',encoding='utf-8') as fin:
        all_context=fin.read().strip().split('\n\n')
    N=len(all_context)

    with open(outfile,'w',encoding='utf-8') as fout:
        for i,doc in enumerate(all_context,1):
            print("Processing:{0}%".format(round(i * 100 / N)), end="\r")
            pmid=''
            title=''
            abstract=''
            for line in doc.split('\n'):
                # The marker that appears first decides the line kind, so an
                # abstract that merely contains "|t|" is not taken for a title.
                # partition() keeps everything after that first marker.
                t_pos=line.find('|t|')
                a_pos=line.find('|a|')
                if t_pos!=-1 and (a_pos==-1 or t_pos<a_pos):
                    pmid,_,title=line.partition('|t|')
                elif a_pos!=-1:
                    pmid,_,abstract=line.partition('|a|')
            if pmid=='':
                continue

            fout.write(pmid+"|t|"+title+"\n")
            fout.write(pmid+"|a|"+abstract+"\n")
            # Offsets returned by bioTag index into title + ' ' + abstract.
            intext=title+' '+abstract
            tag_result=bioTag(intext,biotag_dic,nn_model,onlyLongest=para_set['onlyLongest'], abbrRecog=para_set['abbrRecog'],Threshold=para_set['ML_Threshold'])
            for ele in tag_result:
                # ele = (start, end, concept id, score); the score in ele[3]
                # is not written to this output format.
                start=ele[0]
                last=ele[1]
                mention=intext[int(start):int(last)]
                mention_type='Phenotype'
                concept_id=ele[2]
                fout.write(pmid+"\t"+start+"\t"+last+"\t"+mention+"\t"+mention_type+"\t"+concept_id+"\n")
            fout.write('\n')

def path_tag_hybrid(inpath,outpath,biotag_dic,nn_model,para_set):
    """Tag every regular file in a directory, writing one result file each.

    Each file in ``inpath`` is read as UTF-8 with trailing whitespace
    stripped, passed to ``bioTag``, and a file of the same name is written to
    ``outpath``. Every output line holds the four fields of one ``bioTag``
    result followed by the text span ``intext[start:end]``, tab-separated.

    Args:
        inpath: directory holding the input text files.
        outpath: existing directory that receives the result files.
        biotag_dic: dictionary object handed unchanged to ``bioTag``.
        nn_model: model object handed unchanged to ``bioTag``.
        para_set: mapping providing the keys ``onlyLongest``, ``abbrRecog``
            and ``ML_Threshold``, forwarded to ``bioTag``.
    """
    # List the directory once and keep only regular files: sub-directories
    # cannot be opened as text, and one listing keeps the progress total in
    # step with the loop.
    filenames=[name for name in os.listdir(inpath)
               if os.path.isfile(os.path.join(inpath,name))]
    N=len(filenames)

    for i,filename in enumerate(filenames,1):
        print("Processing:{0}%".format(round(i * 100 / N)), end="\r")
        # os.path.join works whether or not the directory arguments end with
        # a path separator.
        with open(os.path.join(inpath,filename),'r',encoding='utf-8') as fin:
            intext=fin.read().rstrip()
        tag_result=bioTag(intext,biotag_dic,nn_model,onlyLongest=para_set['onlyLongest'], abbrRecog=para_set['abbrRecog'],Threshold=para_set['ML_Threshold'])
        with open(os.path.join(outpath,filename),'w',encoding='utf-8') as fout:
            for ele in tag_result:
                fout.write('\t'.join([ele[0],ele[1],ele[2],ele[3],intext[int(ele[0]):int(ele[1])]])+'\n')



def phecr_tag(infile,para_set,outfile):
    """Load the dictionary and the selected model, then tag ``infile``.

    Args:
        infile: input file or directory. A directory is processed with
            ``path_tag_hybrid``, a regular file with ``file_tag_hybrid``.
        para_set: settings mapping. ``model_type`` must be ``'cnn'`` or
            ``'bioformer'``; the mapping is also forwarded to the tagging
            function.
        outfile: output file (file input) or output directory (directory
            input).

    Raises:
        SystemExit: with a non-zero status if ``model_type`` is not
            supported.
    """
    model_type=para_set['model_type']
    if model_type not in ('cnn','bioformer'):
        # A message argument makes sys.exit write to stderr and exit with
        # status 1, so the failure is visible to calling scripts.
        sys.exit('Model type is wrong, please select cnn or bioformer.')

    # Check the input before the dictionary and model are loaded, so a wrong
    # path is reported immediately instead of being silently ignored.
    input_is_dir=os.path.isdir(infile)
    if not input_is_dir and not os.path.isfile(infile):
        print('Input path does not exist:',infile,file=sys.stderr)
        return

    ontfiles={'dic_file':'../dict_new/noabb_lemma.dic',
              'word_hpo_file':'../dict_new/word_id_map.json',
              'hpo_word_file':'../dict_new/id_word_map.json'}

    if model_type=='cnn':
        vocabfiles={'w2vfile':'../vocab/bio_embedding_intrinsic.d200',
                    'charfile':'../vocab/char.vocab',
                    'labelfile':'../dict_new/lable.vocab',
                    'posfile':'../vocab/pos.vocab'}
        modelfile='../models/cnn_p5n5_b128_95_hponew1.h5'
    else:
        vocabfiles={'labelfile':'../dict_new/lable.vocab',
                    'config_path':'../vocab/bioformer-cased-v1.0/bert_config.json',
                    'checkpoint_path':'../vocab/bioformer-cased-v1.0/bioformer-cased-v1.0-model.ckpt-2000000',
                    'vocab_path':'../vocab/bioformer-cased-v1.0/vocab.txt'}
        modelfile='../models/bioformer_p5n5_b64_1e-5_95_hponew3.h5'

    # The dictionary is built before the network, as in the original flow.
    biotag_dic=dic_ont(ontfiles)
    if model_type=='cnn':
        nn_model=bioTag_CNN(vocabfiles)
    else:
        nn_model=bioTag_Bioformer(vocabfiles)
    nn_model.load_model(modelfile)

    if input_is_dir:
        print("Input a directory:",infile,'\n....tagging begin....')
        tagger=path_tag_hybrid
    else:
        print("Input a file:", infile, '\n....tagging begin....')
        tagger=file_tag_hybrid
    start_time=time.time()
    tagger(infile,outfile,biotag_dic,nn_model,para_set)
    print('tagging done, processing time:',time.time()-start_time)
    

if __name__=="__main__":

    # Command-line options: input and output may each be a file or a directory.
    cli=argparse.ArgumentParser(description='Tagging free text with PhenoTagger, python PhenoTagger_tagging.py -i infile/inpath -o outfile/outpath')
    cli.add_argument('--infile', '-i', default='../example/ex2/', help="the input file or path")
    cli.add_argument('--outfile', '-o', default='../example/ex2_output/', help="the output file or path")
    options=cli.parse_args()

    # Tagging settings handed to phecr_tag.
    para_set=dict(
        model_type='bioformer',  # cnn or bioformer
        onlyLongest=True,        # False: return overlapping concepts, True: only the longest
        abbrRecog=True,          # False: don't identify abbreviations, True: identify them
        ML_Threshold=0.95,       # threshold of the deep learning model
    )

    # For directory input, create the output directory when nothing exists at
    # that path yet.
    if os.path.isdir(options.infile):
        if not os.path.exists(options.outfile):
            os.makedirs(options.outfile)

    phecr_tag(options.infile,para_set,options.outfile)