from typing import List

import spacy
from spacy.language import Language

from util.process_data import Token, Sample, SampleList

class Tokenizer:
    """Tokenizes sample texts with spaCy, recording character offsets per token."""

    def __init__(self, spacy_model: str):
        # Load the spaCy pipeline once so every run reuses the same model.
        self.__spacy_model = spacy.load(spacy_model)

    def run(self, sample_list: SampleList) -> None:
        self.__tokenize(sample_list.samples, self.__spacy_model)

    def __tokenize(self, samples: List[Sample], spacy_model: Language) -> None:
        # Normalize non-breaking spaces to regular spaces so they behave like
        # ordinary whitespace during tokenization, then batch-process all
        # texts through the pipeline.
        doc_pipe = spacy_model.pipe([sample.text.replace('\xa0', ' ') for sample in samples])
        for sample, doc in zip(samples, doc_pipe):
            # Keep each token's surface form plus its character offsets in
            # the original text.
            sample.tokens = [Token(
                text=x.text,
                start=x.idx,
                end=x.idx + len(x.text)
            ) for x in doc]
            # Strip trailing whitespace-only tokens; the emptiness check
            # guards against an IndexError when a sample tokenizes to
            # nothing but whitespace.
            while sample.tokens and ('\n' in sample.tokens[-1].text or ' ' in sample.tokens[-1].text):
                sample.tokens.pop()
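
# Usage sketch (assumption: Sample and SampleList accept the keyword
# arguments shown below; check util.process_data for the real signatures).
if __name__ == '__main__':
    # 'en_core_web_sm' is an example model name; install it first with
    # `python -m spacy download en_core_web_sm`.
    tokenizer = Tokenizer('en_core_web_sm')
    sample_list = SampleList(samples=[Sample(text='Hello\xa0world \n')])
    tokenizer.run(sample_list)
    for token in sample_list.samples[0].tokens:
        print(token.text, token.start, token.end)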