File size: 1,293 Bytes
ae5152f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
# -*- coding: utf-8 -*-
"""
Created on Fri Jun 12 15:26:44 2020

@author: luol2
"""

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.stem.porter import PorterStemmer
# Shared instances, created once at import time and reused for every token
# processed by ssplit_token_pos_lemma.
lemmatizer = WordNetLemmatizer() 
stemmer = PorterStemmer()
import io
    
def get_wordnet_pos(treebank_tag):
    """Map a POS tag string onto one of the WordNet POS constants.

    Tags starting with 'J' map to ``wordnet.ADJ``, 'V' to ``wordnet.VERB``
    and 'N' to ``wordnet.NOUN``.  Tags starting with 'R', as well as the
    exact tag 'IN', map to ``wordnet.ADV``.  Every other tag falls back to
    ``wordnet.NOUN``.
    """
    # Prefix -> name of the wordnet attribute to return; checked in order.
    prefix_table = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
    )
    for prefix, pos_name in prefix_table:
        if treebank_tag.startswith(prefix):
            return getattr(wordnet, pos_name)

    # 'IN' is the only tag matched exactly rather than by prefix.
    if treebank_tag.startswith('R') or treebank_tag == 'IN':
        return wordnet.ADV

    # Anything unrecognised is treated as a noun.
    return wordnet.NOUN
    
def ssplit_token_pos_lemma(in_text):
    """Sentence-split, tokenize, POS-tag, lemmatize and stem ``in_text``.

    The input is stripped, and every '-' and '/' is padded with a space on
    each side before tokenization (presumably so the tokenizer emits them
    as separate tokens).  Sentences come from ``nltk.sent_tokenize``, tokens
    from ``nltk.word_tokenize`` and tags from ``nltk.pos_tag``.

    Returns a single string with one line per token made of four
    tab-separated fields -- original token, lemma, stem, POS tag -- where
    lemma and stem are computed from the lower-cased token.  An empty line
    is emitted after each sentence.
    """
    padded = in_text.strip().replace('-', ' - ').replace('/', ' / ')

    # Tokenize every sentence up front, before any tagging starts.
    tokenized_sentences = [
        nltk.word_tokenize(sentence)
        for sentence in nltk.sent_tokenize(padded)
    ]

    rows = []
    for words in tokenized_sentences:
        for word, tag in nltk.pos_tag(words):
            lowered = word.lower()
            # The lemmatizer needs a WordNet POS, not the raw tagger tag.
            lemma = lemmatizer.lemmatize(lowered, get_wordnet_pos(tag))
            stem = stemmer.stem(lowered)
            rows.append('\t'.join((word, lemma, stem, tag)) + '\n')
        # Blank line terminates the sentence.
        rows.append('\n')

    return ''.join(rows)