Spaces:

wanderer2k1
/

VN_laws_qa

Sleeping

File size: 2,018 Bytes

9833a80

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on 

@author:  

@title: clean_dataset

@descriptions: helper functions for splitting DataFrame text into fixed-width chunks and for basic text cleaning.
"""

#%%
import pandas as pd
import numpy as np
import string
from itertools import chain
from textwrap3 import wrap
import re

def split_at_length(dataframe, column, length, title = True):
    """
    Wrap the text in ``column`` into chunks of at most ``length`` characters.

    :param dataframe: pandas DataFrame holding the text; it is not modified,
        a copy with an extra ``wrapped`` column is returned instead
    :param column: name of the column whose values are wrapped (each value is
        converted with ``str`` first)
    :param length: maximum width of a chunk, passed to ``wrap``
    :param title: when truthy, also build a long-format table with one row per
        chunk; this requires a ``title`` column in ``dataframe``
    :return: tuple ``(dataframe, splitted)``. ``dataframe`` is the copy whose
        ``wrapped`` column holds the chunks of each row joined by ``"; "``.
        ``splitted`` is a DataFrame with columns ``text`` and ``title`` (one
        row per chunk) when ``title`` is truthy, otherwise an empty list.
    """
    chunk_lists = [wrap(str(value), length) for value in dataframe[column]]

    dataframe = dataframe.assign(
        wrapped=['; '.join(chunks) for chunks in chunk_lists]
    )

    if not title:
        return dataframe, []

    # Build the per-chunk table straight from the wrapped lists. Re-splitting
    # the "; "-joined string would also cut chunks at every "; " that occurs
    # in the text itself, yielding fragments unrelated to ``length``.
    texts = []
    titles = []
    for doc_title, chunks in zip(dataframe['title'], chunk_lists):
        # ``wrap`` returns no chunks for empty text; keep a single empty row
        # so that every input row is still represented in the output.
        for chunk in chunks or ['']:
            texts.append(chunk)
            titles.append(doc_title)

    # Constructing the frame directly also works for an empty input, where
    # concatenating an empty list of Series would raise ValueError.
    splitted = pd.DataFrame({'text': texts, 'title': titles})

    return dataframe, splitted

def basic(s):
    """
    Normalise a string for text processing.

    :param s: string to be processed
    :return: lowercased string with ASCII punctuation deleted, "http..."
        tokens, digits and non-word characters replaced by single spaces,
        and leading/trailing whitespace stripped
    """
    # Lowercase, then delete every ASCII punctuation character outright
    # (nothing is inserted in its place, so "a,b" becomes "ab").
    cleaned = s.lower().translate(str.maketrans(' ', ' ', string.punctuation))

    # Ordered (pattern, flags) steps; each match is replaced by one space.
    # NOTE: punctuation is already gone at this point, so the first URL
    # pattern (needs "://") and the single-quote pattern cannot match; URLs
    # are effectively caught by the "http\S+" step.
    substitutions = (
        (r'^https?:\/\/.*[\r\n]*', re.MULTILINE),  # whole line starting with a URL
        (r"http\S+", 0),                           # any token starting with "http"
        ('\n', 0),                                 # new line characters
        ("\'", 0),                                 # distracting single quotes
        (r'\d+', 0),                               # runs of digits
        (r'\W+', 0),                               # runs of non-word characters
    )
    for pattern, flags in substitutions:
        cleaned = re.sub(pattern, ' ', cleaned, flags=flags)

    return cleaned.strip()

def remove_linebreaks(s):
    """
    Replace line breaks in a string with spaces.

    :param s: string to be processed
    :return: string in which every line break ("\\r\\n", "\\r" or "\\n") is
        replaced by a single space, with leading/trailing whitespace stripped
    """
    # "\r\n" is listed first so a Windows line ending becomes one space and
    # no stray carriage return is left inside the text.
    s = re.sub(r'\r\n|\r|\n', ' ', s)

    return s.strip()