Spaces:

wanderer2k1
/

VN_laws_qa

Sleeping

App Files Files Community

VN_laws_qa / src /clean_dataset.py

wanderer2k1

9833a80 about 1 year ago

raw

history blame

No virus

2.02 kB

	#!/usr/bin/env python3
	# -*- coding: utf-8 -*-
	"""
	Created on

	@author:

	@title: clean_dataset

	@description: functions for wrapping text columns into fixed-length chunks and for basic text cleaning.
	"""

	#%%
	import pandas as pd
	import numpy as np
	import string
	from itertools import chain
	from textwrap3 import wrap
	import re

	def split_at_length(dataframe, column, length, title = True):
	    """
	    Wrap the text of one column into chunks of at most ``length`` characters.

	    :param dataframe: pandas DataFrame holding the text; the caller's object
	        is not modified, a new frame is returned instead
	    :param column: name of the column whose values are wrapped; every value
	        is converted with ``str`` before wrapping
	    :param length: maximum chunk width, forwarded to ``wrap``
	    :param title: when truthy, additionally build a long-format frame with
	        one row per chunk; this requires a ``'title'`` column in ``dataframe``
	    :return: tuple ``(dataframe, splitted)``. ``dataframe`` is the input plus
	        a ``'wrapped'`` column containing each row's chunks joined by ``'; '``.
	        ``splitted`` is a DataFrame with columns ``'text'`` (one chunk) and
	        ``'title'`` (the title of the row the chunk came from) when ``title``
	        is truthy, otherwise an empty list.
	    """
	    # str() lets non-string cells (numbers, NaN) go through wrap() unharmed.
	    chunk_lists = [wrap(str(value), length) for value in dataframe[column]]

	    # The 'wrapped' column stays a single '; '-joined string per row.
	    dataframe = dataframe.assign(
	        wrapped=['; '.join(chunks) for chunks in chunk_lists]
	    )

	    if not title:
	        return dataframe, []

	    # Build the long format straight from the chunk lists. Re-splitting the
	    # joined string on '; ' would cut chunks apart wherever the source text
	    # itself contains '; '.
	    # A row whose text wraps to nothing still yields one empty-text row, so
	    # every title remains represented in the output.
	    records = [
	        (chunk, row_title)
	        for row_title, chunks in zip(dataframe['title'], chunk_lists)
	        for chunk in (chunks or [''])
	    ]

	    # Constructing the frame from records also copes with an empty input,
	    # where concatenating zero Series would raise.
	    splitted = pd.DataFrame(records, columns=['text', 'title'])

	    return dataframe, splitted

	def basic(s):
	    """
	    Normalise a string for text matching.

	    The text is lowercased, ASCII punctuation is deleted, and then URL-like
	    tokens, newlines, digits and any other non-word characters are replaced
	    by single spaces.

	    :param s: string to be processed
	    :return: the cleaned string with leading and trailing whitespace removed
	    """
	    # Lowercase first, then drop every character in string.punctuation.
	    # Punctuation is deleted rather than replaced, so words separated only
	    # by punctuation end up joined together.
	    punctuation_table = str.maketrans(' ', ' ', string.punctuation)
	    cleaned = s.lower().translate(punctuation_table)

	    # Each (pattern, flags) pair is replaced by one space, in this order.
	    substitutions = (
	        # URL at the start of a line; ':' and '/' were already deleted
	        # above, so this pattern cannot match at this point.
	        (r'^https?:\/\/.[\r\n]', re.MULTILINE),
	        # Any token that starts with "http"
	        (r"http\S+", 0),
	        # New line characters
	        ('\n', 0),
	        # Single quotes; also already removed with the punctuation
	        ("\'", 0),
	        # Runs of digits
	        (r'\d+', 0),
	        # Runs of anything that is not a word character
	        (r'\W+', 0),
	    )
	    for pattern, flags in substitutions:
	        cleaned = re.sub(pattern, ' ', cleaned, flags=flags)

	    # Custom word replacements can be added here, e.g.:
	    #cleaned = re.sub(r'strengthenedstakeholder', 'strengthened stakeholder', cleaned)

	    return cleaned.strip()

	def remove_linebreaks(s):
	    """
	    Replace every line break in a string with a single space.

	    A newline, a carriage return followed by a newline, and a lone carriage
	    return each count as one line break.

	    :param s: string to be processed
	    :return: the string with line breaks replaced by spaces and leading and
	        trailing whitespace removed
	    """
	    # Match '\r\n' as one unit so Windows line endings produce one space and
	    # no stray '\r' is left inside the text.
	    s = re.sub(r'\r\n?|\n', ' ', s)

	    return s.strip()