# PhenoTagger-Demo / src / PhenoTagger_tagging.py -- commit 273b73b ("add file") by Ling Luo, 5.78 kB
# -*- coding: utf-8 -*-
"""
Created on Thu Aug 13 09:20:38 2020
@author: luol2
"""
import argparse
from nn_model import bioTag_CNN,bioTag_BERT,bioTag_Bioformer
from dic_ner import dic_ont
from tagging_text import bioTag
import os
import time
import json
import sys
def file_tag_hybrid(infile,outfile,biotag_dic,nn_model,para_set):
    """Tag every document of a PubTator-style file and write the annotations.

    The input is split into documents on blank lines. Inside a document, a
    line containing ``|t|`` provides the document id and title, and a line
    containing ``|a|`` provides the document id and abstract. Documents in
    which no id is found are skipped. For each remaining document the title
    and abstract lines are echoed to the output, followed by one
    tab-separated annotation line per ``bioTag`` result
    (id, start, end, mention, ``Phenotype``, concept id) and a blank line.

    Args:
        infile: Path of the UTF-8 input file.
        outfile: Path of the output file; it is overwritten.
        biotag_dic: Dictionary object forwarded unchanged to ``bioTag``.
        nn_model: Model object forwarded unchanged to ``bioTag``.
        para_set: Mapping with the keys ``'onlyLongest'``, ``'abbrRecog'``
            and ``'ML_Threshold'``, forwarded to ``bioTag``.
    """
    # Read the whole input first, inside a context manager, so the handle is
    # always released and the output file is not truncated if reading fails.
    with open(infile, 'r', encoding='utf-8') as fin:
        all_context = fin.read().strip().split('\n\n')
    total = len(all_context)  # always >= 1: str.split never returns an empty list

    with open(outfile, 'w', encoding='utf8') as fout:
        for doc_no, doc in enumerate(all_context, start=1):
            print("Processing:{0}%".format(round(doc_no * 100 / total)), end="\r")

            pmid = ''
            title = ''
            abstract = ''
            for line in doc.split('\n'):
                # maxsplit=1 keeps the full text even if the marker occurs
                # again later in the line; truncating it would shift the
                # character offsets reported below.
                seg_t = line.split('|t|', 1)
                if len(seg_t) == 2:
                    pmid, title = seg_t
                    continue
                seg_a = line.split('|a|', 1)
                if len(seg_a) == 2:
                    pmid, abstract = seg_a

            if pmid == '':
                # No title/abstract line in this block: nothing to tag.
                continue

            fout.write(pmid+"|t|"+title+"\n")
            fout.write(pmid+"|a|"+abstract+"\n")

            # Offsets in the results are used as indices into this string.
            intext = title+' '+abstract
            tag_result = bioTag(intext, biotag_dic, nn_model,
                                onlyLongest=para_set['onlyLongest'],
                                abbrRecog=para_set['abbrRecog'],
                                Threshold=para_set['ML_Threshold'])
            for ele in tag_result:
                # ele[0]/ele[1] are the start/end offsets and ele[2] the
                # concept id; they are joined as strings, so bioTag must
                # return them as str. ele[3] (the score) is not written.
                start, last, concept_id = ele[0], ele[1], ele[2]
                mention = intext[int(start):int(last)]
                fout.write("\t".join([pmid, start, last, mention, 'Phenotype', concept_id])+"\n")
            fout.write('\n')
def path_tag_hybrid(inpath,outpath,biotag_dic,nn_model,para_set):
    """Tag every file in a directory, writing one result file per input file.

    Each regular file in ``inpath`` is read as UTF-8 text (trailing
    whitespace stripped), passed to ``bioTag``, and the results are written
    to a file of the same name in ``outpath``. Every output line holds the
    four result fields followed by the mention text sliced from the input,
    separated by tabs.

    Args:
        inpath: Directory containing the input text files.
        outpath: Existing directory that receives the output files.
        biotag_dic: Dictionary object forwarded unchanged to ``bioTag``.
        nn_model: Model object forwarded unchanged to ``bioTag``.
        para_set: Mapping with the keys ``'onlyLongest'``, ``'abbrRecog'``
            and ``'ML_Threshold'``, forwarded to ``bioTag``.
    """
    # List the directory once. Sub-directories are skipped because they
    # cannot be opened as text files.
    filenames = [name for name in os.listdir(inpath)
                 if os.path.isfile(os.path.join(inpath, name))]
    total = len(filenames)

    for file_no, filename in enumerate(filenames, start=1):
        print("Processing:{0}%".format(round(file_no * 100 / total)), end="\r")

        # os.path.join works whether or not the directory arguments end
        # with a path separator; plain concatenation did not.
        with open(os.path.join(inpath, filename), 'r', encoding='utf-8') as fin:
            intext = fin.read().rstrip()

        tag_result = bioTag(intext, biotag_dic, nn_model,
                            onlyLongest=para_set['onlyLongest'],
                            abbrRecog=para_set['abbrRecog'],
                            Threshold=para_set['ML_Threshold'])

        with open(os.path.join(outpath, filename), 'w', encoding='utf-8') as fout:
            for ele in tag_result:
                # ele[0]/ele[1] are used as offsets into intext; all four
                # fields are joined as strings, so bioTag must return str.
                mention = intext[int(ele[0]):int(ele[1])]
                fout.write('\t'.join([ele[0], ele[1], ele[2], ele[3], mention])+'\n')
def phecr_tag(infile,para_set,outfile):
    """Load the dictionary and the selected model, then tag a file or directory.

    All resource paths are relative (``../dict_new``, ``../vocab``,
    ``../models``) and are therefore resolved against the current working
    directory.

    Args:
        infile: Path of an input file or of a directory of input files.
        para_set: Mapping with ``'model_type'`` (``'cnn'`` or
            ``'bioformer'``) plus the tagging options that are forwarded to
            the tagging functions.
        outfile: Output file (file input) or output directory (directory
            input).

    Raises:
        SystemExit: With status 1 when ``para_set['model_type']`` is neither
            ``'cnn'`` nor ``'bioformer'``.
    """
    ontfiles={'dic_file':'../dict_new/noabb_lemma.dic',
              'word_hpo_file':'../dict_new/word_id_map.json',
              'hpo_word_file':'../dict_new/id_word_map.json'}

    model_type = para_set['model_type']
    if model_type=='cnn':
        vocabfiles={'w2vfile':'../vocab/bio_embedding_intrinsic.d200',
                    'charfile':'../vocab/char.vocab',
                    'labelfile':'../dict_new/lable.vocab',
                    'posfile':'../vocab/pos.vocab'}
        modelfile='../models/cnn_p5n5_b128_95_hponew1.h5'
    elif model_type=='bioformer':
        vocabfiles={'labelfile':'../dict_new/lable.vocab',
                    'config_path':'../vocab/bioformer-cased-v1.0/bert_config.json',
                    'checkpoint_path':'../vocab/bioformer-cased-v1.0/bioformer-cased-v1.0-model.ckpt-2000000',
                    'vocab_path':'../vocab/bioformer-cased-v1.0/vocab.txt'}
        modelfile='../models/bioformer_p5n5_b64_1e-5_95_hponew3.h5'
    else:
        print('Model type is wrong, please select cnn or bioformer.')
        # Non-zero status so shells and pipelines see the failure.
        sys.exit(1)

    # Check the input before the expensive dictionary/model load, and report
    # a missing path instead of silently doing nothing.
    input_is_dir = os.path.isdir(infile)
    if not input_is_dir and not os.path.isfile(infile):
        print('Input path is neither a file nor a directory:', infile)
        return

    biotag_dic=dic_ont(ontfiles)
    if model_type=='cnn':
        nn_model=bioTag_CNN(vocabfiles)
    else:
        nn_model=bioTag_Bioformer(vocabfiles)
    nn_model.load_model(modelfile)

    if input_is_dir:
        print("Input a directory:",infile,'\n....tagging begin....')
        tag_input = path_tag_hybrid
    else:
        print("Input a file:", infile, '\n....tagging begin....')
        tag_input = file_tag_hybrid

    start_time=time.time()
    tag_input(infile,outfile,biotag_dic,nn_model,para_set)
    print('tagging done, processing time:',time.time()-start_time)
if __name__=="__main__":
    # Tagging options handed to phecr_tag.
    para_set = {
        # which model to load: 'cnn' or 'bioformer'
        'model_type': 'bioformer',
        # True keeps only the longest concept; False also returns overlapping ones
        'onlyLongest': True,
        # True enables abbreviation recognition
        'abbrRecog': True,
        # threshold applied to the deep learning model
        'ML_Threshold': 0.95,
    }

    cli = argparse.ArgumentParser(
        description='Tagging free text with PhenoTagger, python PhenoTagger_tagging.py -i infile/inpath -o outfile/outpath'
    )
    cli.add_argument('--infile', '-i', default='../example/ex2/',
                     help="the input file or path")
    cli.add_argument('--outfile', '-o', default='../example/ex2_output/',
                     help="the output file or path")
    args = cli.parse_args()

    # For directory input, create the output directory if nothing exists at
    # that path yet.
    source_is_dir = os.path.isdir(args.infile)
    if source_is_dir and not os.path.exists(args.outfile):
        os.makedirs(args.outfile)

    phecr_tag(args.infile, para_set, args.outfile)