robertou2 committed
Commit 88e5a9f
1 Parent(s): e52a4dc

Update app.py

Files changed (1)
  1. app.py +29 -9
app.py CHANGED
@@ -27,13 +27,32 @@ auth = tw.OAuthHandler(consumer_key, consumer_secret)
 auth.set_access_token(access_token, access_token_secret)
 api = tw.API(auth, wait_on_rate_limit=True)
 
-def strip_undesired_chars(tweet):
-    stripped_tweet = tweet.replace('\n', ' ').replace('\r', '')
-    char_list = [stripped_tweet[j] for j in range(len(stripped_tweet)) if ord(stripped_tweet[j]) in range(65536)]
-    stripped_tweet = ''
-    for j in char_list:
-        stripped_tweet = stripped_tweet + j
-    return stripped_tweet
+def preprocess(text):
+    text = text.lower()
+    # remove hyperlinks
+    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text)
+    text = re.sub(r'http?:\/\/.*[\r\n]*', '', text)
+    # replace &amp; with 'and', and &lt;, &gt; with <, >
+    text = text.replace(r'&amp;?', r'and')
+    text = text.replace(r'&lt;', r'<')
+    text = text.replace(r'&gt;', r'>')
+    # remove hashtag sign
+    #text = re.sub(r"#", "", text)
+    # remove mentions
+    text = re.sub(r"(?:\@)\w+", '', text)
+    #text = re.sub(r"@", "", text)
+    # remove non-ASCII chars
+    text = text.encode("ascii", errors="ignore").decode()
+    # remove some punctuation (except . ! ?)
+    text = re.sub(r'[:"#$%&\*+,-/:;<=>@\\^_`{|}~]+', '', text)
+    text = re.sub(r'[!]+', '!', text)
+    text = re.sub(r'[?]+', '?', text)
+    text = re.sub(r'[.]+', '.', text)
+    text = re.sub(r"'", "", text)
+    text = re.sub(r"\(", "", text)
+    text = re.sub(r"\)", "", text)
+    text = " ".join(text.split())
+    return text
 
 
 st.title('Analisis de comentarios sexistas en Twitter con Tweepy and HuggingFace Transformers')
@@ -50,9 +69,10 @@ def run():
     #tweets = tweepy.Cursor(api.search, q=new_search, lang="es", since=date_since).items(number_of_tweets)
     #tweets = tw.Cursor(api.search_tweets, q=search_words).items(number_of_tweets)
     tweets = tw.Cursor(api.search_tweets, q=new_search, lang="es", since=date_since).items(number_of_tweets)
-    #tweet_list = [i.text for i in tweets]
-    tweet_list = [strip_undesired_chars(i.text) for i in tweets]
+    tweet_list = [i.text for i in tweets]
+    #tweet_list = [strip_undesired_chars(i.text) for i in tweets]
     text = pd.DataFrame(tweet_list)
+    text[0] = text[0].apply(preprocess)
     text1 = text[0].values
     indices1 = tokenizer.batch_encode_plus(text1.tolist(),
                                            max_length=128,
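
As a quick sanity check of the new cleaning step, here is a minimal usage sketch. The sample tweet is invented for illustration, and `preprocess` is assumed to be the function added in this commit, with `re` and `pandas` available in app.py:

import pandas as pd

# Invented example tweet (not real data)
sample = "RT @UserName: Esto es INACEPTABLE!!! Nadie dice nada https://t.co/abc123"

print(preprocess(sample))
# -> 'rt esto es inaceptable! nadie dice nada'
#    (lowercased, mention and trailing link stripped, repeated '!' collapsed, whitespace normalized)

# The same cleaning applied column-wise, as run() now does after building the DataFrame:
text = pd.DataFrame([sample])
text[0] = text[0].apply(preprocess)
print(text[0].tolist())
# -> ['rt esto es inaceptable! nadie dice nada']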