robertou2 committed
Commit 88e5a9f
1 Parent(s): e52a4dc

Update app.py

Files changed (1)
  1. app.py +29 -9
app.py CHANGED
@@ -27,13 +27,32 @@ auth = tw.OAuthHandler(consumer_key, consumer_secret)
 auth.set_access_token(access_token, access_token_secret)
 api = tw.API(auth, wait_on_rate_limit=True)
 
-def strip_undesired_chars(tweet):
-    stripped_tweet = tweet.replace('\n', ' ').replace('\r', '')
-    char_list = [stripped_tweet[j] for j in range(len(stripped_tweet)) if ord(stripped_tweet[j]) in range(65536)]
-    stripped_tweet = ''
-    for j in char_list:
-        stripped_tweet = stripped_tweet + j
-    return stripped_tweet
+def preprocess(text):
+    text = text.lower()
+    # remove hyperlinks
+    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text)
+    text = re.sub(r'http?:\/\/.*[\r\n]*', '', text)
+    # replace &amp; with 'and', and &lt;, &gt; with <, >
+    text = text.replace(r'&amp;?', r'and')
+    text = text.replace(r'&lt;', r'<')
+    text = text.replace(r'&gt;', r'>')
+    # remove hashtag sign
+    #text = re.sub(r"#", "", text)
+    # remove mentions
+    text = re.sub(r"(?:\@)\w+", '', text)
+    #text = re.sub(r"@", "", text)
+    # remove non-ASCII chars
+    text = text.encode("ascii", errors="ignore").decode()
+    # remove some punctuation (except . ! ?)
+    text = re.sub(r'[:"#$%&\*+,-/:;<=>@\\^_`{|}~]+', '', text)
+    text = re.sub(r'[!]+', '!', text)
+    text = re.sub(r'[?]+', '?', text)
+    text = re.sub(r'[.]+', '.', text)
+    text = re.sub(r"'", "", text)
+    text = re.sub(r"\(", "", text)
+    text = re.sub(r"\)", "", text)
+    text = " ".join(text.split())
+    return text
 
 
 st.title('Analisis de comentarios sexistas en Twitter con Tweepy and HuggingFace Transformers')
@@ -50,9 +69,10 @@ def run():
     #tweets = tweepy.Cursor(api.search, q=new_search, lang="es", since=date_since).items(number_of_tweets)
     #tweets = tw.Cursor(api.search_tweets, q=search_words).items(number_of_tweets)
     tweets = tw.Cursor(api.search_tweets, q=new_search, lang="es", since=date_since).items(number_of_tweets)
-    #tweet_list = [i.text for i in tweets]
-    tweet_list = [strip_undesired_chars(i.text) for i in tweets]
+    tweet_list = [i.text for i in tweets]
+    #tweet_list = [strip_undesired_chars(i.text) for i in tweets]
     text = pd.DataFrame(tweet_list)
+    text[0] = text[0].apply(preprocess)
     text1 = text[0].values
     indices1 = tokenizer.batch_encode_plus(text1.tolist(),
                                            max_length=128,
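
As a quick sanity check of the new cleaning step, here is a minimal usage sketch. The sample tweet is invented for illustration, and `preprocess` is assumed to be the function added in this commit, with `re` and `pandas` available in app.py:

import pandas as pd

# Invented example tweet (not real data)
sample = "RT @UserName: Esto es INACEPTABLE!!! Nadie dice nada https://t.co/abc123"

print(preprocess(sample))
# -> 'rt esto es inaceptable! nadie dice nada'
#    (lowercased, mention and trailing link stripped, repeated '!' collapsed, whitespace normalized)

# The same cleaning applied column-wise, as run() now does after building the DataFrame:
text = pd.DataFrame([sample])
text[0] = text[0].apply(preprocess)
print(text[0].tolist())
# -> ['rt esto es inaceptable! nadie dice nada']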