File size: 4,150 Bytes
5eea398
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

class SpaceGen_preprocessing:
    """Helpers for turning wrongly-spaced text into per-character decisions.

    Text is handled as a list of byte values.  A *decision vector* holds one
    label per element of the wrongly-spaced text:

    * ``'K'`` - keep the element as it is,
    * ``'D'`` - delete the element (a superfluous space),
    * ``'I'`` - insert a space in front of the element.
    """

    # Byte value of the space character that the decision logic keys on.
    _SPACE = 32
    # Filler for window slots that fall outside the sequence; also added to
    # the vocabulary by ``create_vocabulary``.
    _PAD = -1

    def __init__(self, content="helloworld", size=10, past_capacity=5, future_capacity=5):
        """Store the configuration.

        Args:
            content: Text to work on; only its first ``size`` items are kept.
            size: Maximum number of items of ``content`` to keep.
            past_capacity: Stored as-is; added into ``num_features``.
            future_capacity: Stored as-is; added into ``num_features``.
        """
        self.size = size
        self.content = content[:self.size]
        self.past_capacity = past_capacity
        self.future_capacity = future_capacity
        # 1 for letter
        self.num_features = self.past_capacity + self.future_capacity + 1
        self.vocabulary = []

    def create_vocabulary(self, correct_txt):
        """Build the sorted vocabulary and store it in ``self.vocabulary``.

        The vocabulary is every distinct UTF-8 byte value of ``correct_txt``
        plus ``-1``, in ascending order.

        Args:
            correct_txt: Text whose bytes form the vocabulary.

        Returns:
            None. The result is only available through ``self.vocabulary``.
        """
        byte_values = set(bytes(correct_txt, 'utf-8'))
        byte_values.add(SpaceGen_preprocessing._PAD)
        self.vocabulary = sorted(byte_values)
        return None

    @staticmethod
    def create_decision_vector(W: list, C: list):
        """Return the Decision Vector (D) for a wrong/correct text pair.

        Both texts are walked with one cursor each.  The result has exactly
        one label (``'K'``, ``'D'`` or ``'I'``) per element of ``W``.

        Args:
            W: Byte values of the wrongly-spaced text.
            C: Byte values of the correct text.

        Returns:
            List of labels, the same length as ``W``.

        Raises:
            IndexError: If ``C`` is exhausted while ``W`` still holds a
                non-space element, i.e. the two texts do not carry the same
                characters.
        """
        space = SpaceGen_preprocessing._SPACE
        decisions = []
        w_i = 0
        c_i = 0
        n_w = len(W)
        n_c = len(C)
        while w_i < n_w:
            w = W[w_i]
            if c_i >= n_c:
                # Nothing left in the correct text: a leftover space is
                # simply superfluous, anything else means the inputs differ.
                if w != space:
                    raise IndexError(
                        'correct text exhausted before wrong text at index %d' % w_i
                    )
                decisions.append('D')
                w_i += 1
                continue
            c = C[c_i]
            if w == c:
                decisions.append('K')
                w_i += 1
                c_i += 1
            elif w == space:
                # w != c, so the correct text has no space here.
                decisions.append('D')
                w_i += 1
            elif c == space:
                # The correct text has a space that is missing in front of w.
                decisions.append('I')
                c_i += 1
                # Also consume the copy of w that follows that space in C.
                # Otherwise the C cursor lags by one and a correct space
                # right after w would be labelled 'D' (and the following
                # letter 'I').
                if c_i < n_c and C[c_i] == w:
                    c_i += 1
                w_i += 1
            else:
                # Two different non-space values: advance C to re-align.
                c_i += 1
        return decisions

    @staticmethod
    def to_correct(W, D):
        """Return the corrected text for a wrong vector and its decisions.

        Args:
            W: Byte values of the wrongly-spaced text.
            D: Decision label for each position of ``W``.

        Returns:
            The decoded string after applying every decision.  Labels other
            than ``'K'``, ``'I'`` and ``'D'`` contribute nothing.
        """
        space = SpaceGen_preprocessing._SPACE
        output_vec = []
        for i, decision in enumerate(D):
            if decision == 'K':
                output_vec.append(W[i])
            elif decision == 'I':
                output_vec.extend((space, W[i]))
            # 'D' drops the element, so nothing is emitted.
        return bytes(output_vec).decode()

    @staticmethod
    def to_bytes_list(text: str, encoding='UTF-8'):
        """Return the byte values of ``text`` encoded with ``encoding``."""
        return list(bytes(text, encoding))

    @staticmethod
    def to_one_hot_df(wrong_txt, D):
        """Return a dataframe of letters with one-hot encoded decisions.

        Args:
            wrong_txt: Iterable whose items fill the ``letter`` column.
            D: Decision label for each item of ``wrong_txt``.

        Returns:
            DataFrame with the ``letter`` column followed by one integer
            indicator column per distinct label found in ``D``.  A label
            that never occurs in ``D`` gets no column.
        """
        df = pd.DataFrame({'letter': list(wrong_txt), 'decision': D})
        encoding = OneHotEncoder()
        y_matrix = encoding.fit_transform(df[['decision']])
        onehot_df = pd.DataFrame(
            y_matrix.toarray(),
            columns=encoding.get_feature_names_out(['decision']),
        )
        onehot_df = onehot_df.astype('int')
        example_df = pd.concat([df, onehot_df], axis=1)
        return example_df.drop(['decision'], axis=1)

    @staticmethod
    def decode_vec(arr):
        """Return the text decoded from a list of byte values."""
        return bytes(arr).decode()

    @staticmethod
    def sliding_window_past(arr, window_size=5):
        """Return, for every position, the ``window_size`` items before it.

        Windows that would reach before the start are left-padded with
        ``-1`` so that every window has the same length.

        Args:
            arr: Input sequence.
            window_size: Number of preceding items per window.

        Returns:
            One list per position of ``arr``.
        """
        pad = SpaceGen_preprocessing._PAD
        items = list(arr)
        windows = []
        for i in range(len(items)):
            history = items[max(0, i - window_size):i]
            # Multiplying by zero yields [], so full windows need no branch.
            windows.append([pad] * (window_size - len(history)) + history)
        return windows

    @staticmethod
    def sliding_window_future(arr, window_size=5):
        """Return, for every position, the ``window_size`` items after it.

        Windows that would run past the end are right-padded with ``-1`` so
        that every window has the same length.

        Args:
            arr: Input sequence.
            window_size: Number of following items per window.

        Returns:
            One list per position of ``arr``.
        """
        pad = SpaceGen_preprocessing._PAD
        items = list(arr)
        windows = []
        for i in range(len(items)):
            upcoming = items[i + 1:i + window_size + 1]
            windows.append(upcoming + [pad] * (window_size - len(upcoming)))
        return windows

    @staticmethod
    def insert_random_spaces(text, percent=.25):
        """Return ``text`` with spaces inserted after randomly chosen items.

        ``round(len(text) * percent)`` indices are drawn (with replacement)
        from ``numpy.random``; a single space is placed after each distinct
        drawn position.  The result is stripped, so it never starts or ends
        with whitespace.

        Args:
            text: Source text.
            percent: Fraction of ``len(text)`` used as the number of draws.

        Returns:
            The text with the extra spaces.
        """
        chars = list(text)
        n_chars = len(chars)
        draws = np.random.randint(0, n_chars + 1, int(np.round(n_chars * percent)))
        # A set gives O(1) membership tests and collapses repeated draws.
        chosen = set(draws.tolist())
        pieces = []
        for i, ch in enumerate(chars):
            pieces.append(ch)
            if i in chosen:
                pieces.append(' ')
        # A draw equal to n_chars could only add a trailing space, which the
        # strip below would remove anyway.
        return ''.join(pieces).strip()

    @staticmethod
    def prob_to_decision(a):
        """Return ``'I'`` when ``a[0] > a[1]``, otherwise ``'K'``.

        Args:
            a: Indexable with at least two comparable entries.
                NOTE(review): presumably the model's (insert, keep)
                probabilities - confirm the order against the caller.
        """
        return 'I' if a[0] > a[1] else 'K'