Spaces:

asafd60
/

SpaceGen

Sleeping

App Files Files Community

SpaceGen / SpaceGen_preprocessing.py

asafd60

Upload 6 files

5eea398 verified 3 months ago

raw

history blame contribute delete

No virus

4.15 kB

	import pandas as pd
	import numpy as np
	from sklearn.preprocessing import OneHotEncoder

	class SpaceGen_preprocessing:
	    '''
	    Byte-level helpers for building space-correction examples.

	    A "wrong" text (W) is compared against a "correct" text (C) and every
	    byte of W receives one decision label:

	    * 'K' - keep the byte as it is,
	    * 'D' - delete the byte (a space that is absent from C),
	    * 'I' - insert a space in front of the byte.

	    The remaining helpers build padded context windows, one-hot targets and
	    noisy input texts.
	    '''

	    SPACE_BYTE = 32  # byte value of ' '
	    PAD_VALUE = -1   # filler for window slots that fall outside the text

	    def __init__(self, content="helloworld", size=10, past_capacity=5, future_capacity=5):
	        '''
	        Store the configuration.

	        content is truncated to its first `size` items. num_features counts
	        the past window, the future window and one slot for the letter itself.
	        '''
	        self.size = size
	        self.content = content[:self.size]
	        self.past_capacity = past_capacity
	        self.future_capacity = future_capacity
	        self.num_features = self.past_capacity + self.future_capacity + 1  # 1 for letter
	        self.vocabulary = []

	    def create_vocabulary(self, correct_txt):
	        '''
	        Store in self.vocabulary the sorted unique UTF-8 byte values of the
	        given text plus the padding value -1. Returns None.
	        '''
	        unique_bytes = set(bytes(correct_txt, 'utf-8'))
	        unique_bytes.add(SpaceGen_preprocessing.PAD_VALUE)
	        self.vocabulary = sorted(unique_bytes)
	        return None

	    @staticmethod
	    def create_decision_vector(W: list, C: list):
	        '''
	        Returns the Decision Vector(D),
	        given Wrong Vector(W) and Correct Vector(C).

	        W and C are sequences of byte values that are expected to differ
	        only in their spaces. D holds one label per byte of W whenever the
	        non-space bytes line up:

	        * 'K' - the byte matches C (or C is already exhausted),
	        * 'D' - W has a space that C does not have,
	        * 'I' - C has a space in front of this byte that W lacks.

	        When two non-space bytes differ, the byte of C is skipped without
	        emitting a label so the comparison can re-synchronise.
	        '''
	        space = SpaceGen_preprocessing.SPACE_BYTE
	        D = []
	        w_i = 0
	        c_i = 0
	        w_len = len(W)
	        c_len = len(C)
	        while w_i < w_len:
	            w_byte = W[w_i]
	            if c_i >= c_len:
	                # C is used up: a left-over space is surplus, anything else
	                # is kept. This replaces an IndexError in the old code.
	                D.append('D' if w_byte == space else 'K')
	                w_i += 1
	                continue
	            c_byte = C[c_i]
	            if w_byte == c_byte:
	                D.append('K')
	                w_i += 1
	                c_i += 1
	            elif w_byte == space:
	                # c_byte differs from w_byte, so it is not a space: the
	                # space in W is surplus.
	                D.append('D')
	                w_i += 1
	            elif c_byte == space:
	                D.append('I')
	                w_i += 1
	                # Consume the whole run of spaces in C (a single 'I' can only
	                # insert one) and then the byte that w_byte stands for.
	                # Advancing by only one left C lagging behind W, so a real
	                # space that followed was labelled 'D' and the byte after it 'I'.
	                while c_i < c_len and C[c_i] == space:
	                    c_i += 1
	                if c_i < c_len and C[c_i] == w_byte:
	                    c_i += 1
	            else:
	                # Two different non-space bytes: skip the byte of C.
	                c_i += 1
	        return D

	    @staticmethod
	    def to_correct(W, D):
	        '''
	        Returns the correct text,
	        given Wrong Vector(W) and Decision Vector(D).

	        Applies D[i] to W[i]: 'K' copies the byte, 'I' emits a space and
	        then the byte, 'D' drops the byte. Any other label is ignored.
	        The resulting bytes are decoded with the default codec (UTF-8).
	        '''
	        space = SpaceGen_preprocessing.SPACE_BYTE
	        output_vec = []
	        for i, decision in enumerate(D):
	            if decision == 'K':
	                output_vec.append(W[i])
	            elif decision == 'I':
	                output_vec.extend((space, W[i]))
	            # 'D' (and unknown labels) contribute nothing.
	        return bytes(output_vec).decode()

	    @staticmethod
	    def to_bytes_list(text: str, encoding='UTF-8'):
	        '''
	        Returns the list of byte values of the given text in the given encoding.
	        '''
	        return list(bytes(text, encoding))

	    @staticmethod
	    def to_one_hot_df(wrong_txt, D):
	        '''
	        Returns a dataframe with a 'letter' column (one row per item of
	        wrong_txt) followed by integer one-hot columns for the decisions in D.

	        The encoder is fitted on D itself, so only the labels that actually
	        occur in D get a 'decision_<label>' column.
	        '''
	        df = pd.DataFrame({'letter': list(wrong_txt), 'decision': D})
	        encoder = OneHotEncoder()
	        y_matrix = encoder.fit_transform(df[['decision']])
	        onehot_df = pd.DataFrame(
	            y_matrix.toarray(),
	            columns=encoder.get_feature_names_out(['decision']),
	        )
	        onehot_df = onehot_df.astype('int')
	        example_df = pd.concat([df, onehot_df], axis=1)
	        return example_df.drop(['decision'], axis=1)

	    @staticmethod
	    def decode_vec(arr):
	        '''
	        Returns the decoded text,
	        given the bytes list.
	        '''
	        return bytes(arr).decode()

	    @staticmethod
	    def sliding_window_past(arr, window_size=5):
	        '''
	        Returns, for every position i, the window_size items that precede
	        arr[i], left-padded with -1 where the text has not started yet.
	        '''
	        pad = SpaceGen_preprocessing.PAD_VALUE
	        arr = list(arr)
	        windows = []
	        for i in range(len(arr)):
	            history = arr[max(0, i - window_size):i]
	            # A non-positive multiplier yields an empty pad, so a full
	            # window needs no special case.
	            windows.append([pad] * (window_size - len(history)) + history)
	        return windows

	    @staticmethod
	    def sliding_window_future(arr, window_size=5):
	        '''
	        Returns, for every position i, the window_size items that follow
	        arr[i], right-padded with -1 where the text has already ended.
	        '''
	        pad = SpaceGen_preprocessing.PAD_VALUE
	        arr = list(arr)
	        windows = []
	        for i in range(len(arr)):
	            upcoming = arr[i + 1:i + window_size + 1]
	            windows.append(upcoming + [pad] * (window_size - len(upcoming)))
	        return windows

	    @staticmethod
	    def insert_random_spaces(text, percent=.25):
	        '''
	        Returns the text with random spaces inserted.

	        round(len(text) * percent) positions are drawn with
	        np.random.randint (duplicates collapse) and a space is placed after
	        each selected character. The result is stripped, so leading and
	        trailing whitespace - including any the input already had - is removed.
	        '''
	        chars = list(text)
	        n_draws = int(np.round(len(chars) * percent))
	        rand_indices = np.random.randint(0, len(chars) + 1, n_draws)
	        # A set gives O(1) membership tests. The index len(chars) can be
	        # drawn too, but it would only add a trailing space that strip()
	        # removes, so it is simply never matched below.
	        chosen = set(rand_indices.tolist())
	        spaced = []
	        for i, ch in enumerate(chars):
	            spaced.append(ch)
	            if i in chosen:
	                spaced.append(' ')
	        return ''.join(spaced).strip()

	    @staticmethod
	    def prob_to_decision(a):
	        '''
	        Return 'I' when a[0] is strictly greater than a[1], otherwise 'K'.
	        '''
	        return 'I' if a[0] > a[1] else 'K'