# -*- coding: utf-8 -*-
"""
Created on Thu Aug 13 09:20:38 2020

@author: luol2

Command-line entry point that tags free text with PhenoTagger.

The input is either a single file of ``pmid|t|title`` / ``pmid|a|abstract``
documents separated by blank lines, or a directory of plain-text files.
"""

import argparse
from nn_model import bioTag_CNN,bioTag_BERT,bioTag_Bioformer
from dic_ner import dic_ont
from tagging_text import bioTag
import os
import time
import json
import sys


def _run_biotag(intext, biotag_dic, nn_model, para_set):
    """Call ``bioTag`` on *intext* with the options stored in *para_set*."""
    return bioTag(
        intext,
        biotag_dic,
        nn_model,
        onlyLongest=para_set['onlyLongest'],
        abbrRecog=para_set['abbrRecog'],
        Threshold=para_set['ML_Threshold'],
    )


def file_tag_hybrid(infile,outfile,biotag_dic,nn_model,para_set):
    """Tag every document of a ``|t|`` / ``|a|`` formatted file.

    Documents in *infile* are separated by a blank line. For each document
    that yields a non-empty pmid, the title line, the abstract line and one
    tab-separated line per tagged mention
    (pmid, start, end, mention, ``Phenotype``, concept id) are written to
    *outfile*, followed by a blank line.

    Args:
        infile: path of the input file.
        outfile: path of the output file (overwritten).
        biotag_dic: dictionary object passed through to ``bioTag``.
        nn_model: loaded model object passed through to ``bioTag``.
        para_set: dict with the keys ``onlyLongest``, ``abbrRecog`` and
            ``ML_Threshold``.
    """
    # Read the input completely before the output is opened for writing, so
    # a failing read does not leave a truncated output file behind.
    with open(infile, 'r', encoding='utf-8') as fin:
        all_context = fin.read().strip().split('\n\n')
    total = len(all_context)

    with open(outfile, 'w', encoding='utf8') as fout:
        for doc_index, doc in enumerate(all_context, start=1):
            print("Processing:{0}%".format(round(doc_index * 100 / total)), end="\r")

            pmid = ''
            title = ''
            abstract = ''
            for line in doc.split('\n'):
                seg_t = line.split('|t|')
                seg_a = line.split('|a|')
                if len(seg_t) >= 2:
                    pmid = seg_t[0]
                    title = seg_t[1]
                elif len(seg_a) >= 2:
                    pmid = seg_a[0]
                    abstract = seg_a[1]

            if pmid == '':
                # No title/abstract line found in this chunk: nothing to tag.
                continue

            fout.write(pmid + "|t|" + title + "\n")
            fout.write(pmid + "|a|" + abstract + "\n")
            # Offsets reported by bioTag index into this concatenated text.
            intext = title + ' ' + abstract
            tag_result = _run_biotag(intext, biotag_dic, nn_model, para_set)
            for ele in tag_result:
                # ele is indexed as (start, end, concept id, score); the
                # score (ele[3]) is deliberately not written in this format.
                start, last, concept_id = ele[0], ele[1], ele[2]
                mention = intext[int(start):int(last)]
                fields = [pmid, str(start), str(last), mention, 'Phenotype', str(concept_id)]
                fout.write("\t".join(fields) + "\n")
            fout.write('\n')


def path_tag_hybrid(inpath,outpath,biotag_dic,nn_model,para_set):
    """Tag every regular file in *inpath* and write results to *outpath*.

    Each input file is read as plain text. The result file has the same file
    name and contains one tab-separated line per tagged mention:
    start, end, concept id, score, mention text.

    Args:
        inpath: directory holding the input text files.
        outpath: existing directory that receives the result files.
        biotag_dic: dictionary object passed through to ``bioTag``.
        nn_model: loaded model object passed through to ``bioTag``.
        para_set: dict with the keys ``onlyLongest``, ``abbrRecog`` and
            ``ML_Threshold``.
    """
    # List the directory once; sub-directories are skipped because they
    # cannot be opened as text files.
    filenames = [
        name for name in os.listdir(inpath)
        if os.path.isfile(os.path.join(inpath, name))
    ]
    total = len(filenames)

    for file_index, filename in enumerate(filenames, start=1):
        print("Processing:{0}%".format(round(file_index * 100 / total)), end="\r")

        # os.path.join works whether or not the directory argument ends with
        # a path separator.
        with open(os.path.join(inpath, filename), 'r', encoding='utf-8') as fin:
            intext = fin.read().rstrip()

        tag_result = _run_biotag(intext, biotag_dic, nn_model, para_set)

        with open(os.path.join(outpath, filename), 'w', encoding='utf-8') as fout:
            for ele in tag_result:
                start, last, concept_id, score = ele[0], ele[1], ele[2], ele[3]
                mention = intext[int(start):int(last)]
                fields = [str(start), str(last), str(concept_id), str(score), mention]
                fout.write('\t'.join(fields) + '\n')


def phecr_tag(infile,para_set,outfile):
    """Load the dictionary and the selected model, then tag *infile*.

    Args:
        infile: path of an input file or an input directory.
        para_set: dict with the keys ``model_type`` (``'cnn'`` or
            ``'bioformer'``), ``onlyLongest``, ``abbrRecog`` and
            ``ML_Threshold``.
        outfile: output file (file input) or output directory (directory
            input).

    Raises:
        SystemExit: if ``para_set['model_type']`` is not a supported value.
    """
    # NOTE: resource paths are relative to the current working directory.
    ontfiles = {'dic_file': '../dict_new/noabb_lemma.dic',
                'word_hpo_file': '../dict_new/word_id_map.json',
                'hpo_word_file': '../dict_new/id_word_map.json'}

    model_type = para_set['model_type']
    if model_type == 'cnn':
        vocabfiles = {'w2vfile': '../vocab/bio_embedding_intrinsic.d200',
                      'charfile': '../vocab/char.vocab',
                      'labelfile': '../dict_new/lable.vocab',
                      'posfile': '../vocab/pos.vocab'}
        modelfile = '../models/cnn_p5n5_b128_95_hponew1.h5'
        model_class = bioTag_CNN
    elif model_type == 'bioformer':
        vocabfiles = {'labelfile': '../dict_new/lable.vocab',
                      'config_path': '../vocab/bioformer-cased-v1.0/bert_config.json',
                      'checkpoint_path': '../vocab/bioformer-cased-v1.0/bioformer-cased-v1.0-model.ckpt-2000000',
                      'vocab_path': '../vocab/bioformer-cased-v1.0/vocab.txt'}
        modelfile = '../models/bioformer_p5n5_b64_1e-5_95_hponew3.h5'
        model_class = bioTag_Bioformer
    else:
        print('Model type is wrong, please select cnn or bioformer.')
        # Non-zero status so shells and pipelines can detect the failure.
        sys.exit(1)

    input_is_dir = os.path.isdir(infile)
    if not input_is_dir and not os.path.isfile(infile):
        # Report the problem before the dictionary and model are loaded
        # instead of finishing silently without producing any output.
        print('Input path does not exist:', infile, file=sys.stderr)
        return

    biotag_dic = dic_ont(ontfiles)
    nn_model = model_class(vocabfiles)
    nn_model.load_model(modelfile)

    if input_is_dir:
        print("Input a directory:",infile,'\n....tagging begin....')
        start_time = time.time()
        path_tag_hybrid(infile, outfile, biotag_dic, nn_model, para_set)
    else:
        print("Input a file:", infile, '\n....tagging begin....')
        start_time = time.time()
        file_tag_hybrid(infile, outfile, biotag_dic, nn_model, para_set)
    print('tagging done, processing time:',time.time()-start_time)


if __name__=="__main__":

    parser = argparse.ArgumentParser(description='Tagging free text with PhenoTagger, python PhenoTagger_tagging.py -i infile/inpath -o outfile/outpath')
    parser.add_argument('--infile', '-i', help="the input file or path",default='../example/ex2/')
    parser.add_argument('--outfile', '-o', help="the output file or path",default='../example/ex2_output/')
    args = parser.parse_args()

    # For directory input the output is a directory too; create it if missing.
    if os.path.isdir(args.infile) and not os.path.exists(args.outfile):
        os.makedirs(args.outfile)

    para_set={
              'model_type':'bioformer', # cnn or bioformer
              'onlyLongest':True, # False: return overlapping concepts, True: only the longest
              'abbrRecog':True, # False: don't identify abbreviations, True: identify abbreviations
              'ML_Threshold':0.95, # the threshold of the deep learning model
              }

    phecr_tag(args.infile,para_set,args.outfile)