"""Streamlit app that downloads tweets with tweepy and flags sexist ones.

Tweets are fetched either by search term or from a user's timeline, cleaned
with ``preprocess`` and classified with a fine-tuned HuggingFace
sequence-classification model. Results are rendered as a table.
"""
import re

import numpy as np
import pandas as pd
import streamlit as st
import torch
import tweepy as tw
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW

tokenizer = AutoTokenizer.from_pretrained('hackathon-pln-es/twitter_sexismo-finetuned-exist2021-metwo')
model = AutoModelForSequenceClassification.from_pretrained("hackathon-pln-es/twitter_sexismo-finetuned-exist2021-metwo")

if torch.cuda.is_available():
    device = torch.device("cuda")
    print('I will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

# The input batches are moved to `device` at prediction time, so the model
# must live on the same device or the forward pass fails on CUDA hosts.
model.to(device)

consumer_key = st.secrets["consumer_key"]
consumer_secret = st.secrets["consumer_secret"]
access_token = st.secrets["access_token"]
access_token_secret = st.secrets["access_token_secret"]
auth = tw.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tw.API(auth, wait_on_rate_limit=True)


def preprocess(text: str) -> str:
    """Return a lower-cased, cleaned-up version of a tweet.

    Steps, in order: lower-case, drop hyperlinks (everything from the URL to
    the end of its line), translate the HTML entities ``&amp;``, ``&lt;`` and
    ``&gt;``, drop @mentions, drop non-ASCII characters, drop punctuation
    other than ``. ! ?``, collapse repeated ``!``, ``?`` and ``.``, remove
    quotes and parentheses, and normalise whitespace.
    """
    text = text.lower()

    # Remove hyperlinks.
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text)
    text = re.sub(r'http?:\/\/.*[\r\n]*', '', text)

    # Translate HTML entities. str.replace is literal (not a regex), so each
    # entity is spelled out in full; otherwise the punctuation pass below
    # would leave "amp"/"lt"/"gt" residue in the text.
    text = text.replace('&amp;', 'and')
    text = text.replace('&lt;', '<')
    text = text.replace('&gt;', '>')

    # Remove mentions.
    text = re.sub(r"@\w+", '', text)

    # Remove non-ASCII characters.
    text = text.encode("ascii", errors="ignore").decode()

    # Remove punctuation except . ! ?  The hyphen is escaped on purpose: an
    # unescaped ",-/" is a character range that also matches ".".
    text = re.sub(r'[:"#$%&*+,\-/;<=>@\\^_`{|}~]+', '', text)

    # Collapse runs of the punctuation that is kept.
    text = re.sub(r'[!]+', '!', text)
    text = re.sub(r'[?]+', '?', text)
    text = re.sub(r'[.]+', '.', text)

    # Remove apostrophes and parentheses.
    text = re.sub(r"'", "", text)
    text = re.sub(r"\(", "", text)
    text = re.sub(r"\)", "", text)

    # Normalise all whitespace to single spaces.
    return " ".join(text.split())


st.set_page_config(layout="wide")

colT1, colT2 = st.columns([2, 7])
with colT2:
    st.title('Analisis de comentarios sexistas en Twitter')
    st.header('Objetivo 5 de los ODS, Lograr la igualdad entre los géneros y empoderar a todas las mujeres y las niñas')
with colT1:
    st.image("https://upload.wikimedia.org/wikipedia/commons/thumb/c/c7/Sustainable_Development_Goal-es-13.jpg/1200px-Sustainable_Development_Goal-es-13.jpg", width=100)

st.markdown('Esta app utiliza tweepy para descargar tweets de twitter en base a la información de entrada y procesa los tweets usando transformers de HuggingFace para detectar comentarios sexistas. El resultado y los tweets correspondientes se almacenan en un dataframe para mostrarlo que es lo que se ve como resultado')
st.markdown('La finalidad del proyecto es en línea con el Objetivo 5 de los ODS, Lograr la igualdad entre los géneros y empoderar a todas las mujeres y las niñas.Una vez analizados los tweets, los que resulten sexistas se pueden contestar para intentar reeducar a las personas que bien por su educación o circustancias presenten un comportamiento contrario al del Objetivo 5 antes mencionado. Igualmente en casos más graves se pueden realizar otras acciones')


def _fetch_tweets(search_words, number_of_tweets, by_term):
    """Return the raw text of up to ``number_of_tweets`` tweets.

    When ``by_term`` is true, ``search_words`` is used as a Spanish-language
    search query with retweets filtered out; otherwise it is treated as a
    screen name and that user's timeline is read.
    """
    if by_term:
        date_since = "2020-09-14"
        new_search = search_words + " -filter:retweets"
        tweets = tw.Cursor(api.search_tweets, q=new_search, lang="es", since=date_since).items(number_of_tweets)
    else:
        tweets = api.user_timeline(screen_name=search_words, count=number_of_tweets)
    # The Cursor is lazy: the network requests happen while this list is
    # built, so callers must wrap this call (not just the Cursor creation)
    # when handling API errors.
    return [tweet.text for tweet in tweets]


def _predict_labels(texts, batch_size=25):
    """Return a NumPy array with the arg-max class index for each text."""
    if not texts:
        return np.array([], dtype=int)

    encoded = tokenizer(
        texts,
        max_length=128,
        add_special_tokens=True,
        return_attention_mask=True,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )
    dataset = TensorDataset(encoded["input_ids"], encoded["attention_mask"])
    loader = DataLoader(dataset, sampler=SequentialSampler(dataset), batch_size=batch_size)

    print('Predicting labels for {:,} test sentences...'.format(len(dataset)))

    model.eval()
    batch_logits = []
    # No gradients are needed for inference; skipping them saves memory.
    with torch.no_grad():
        for input_ids, attention_mask in loader:
            outputs = model(
                input_ids.to(device),
                token_type_ids=None,
                attention_mask=attention_mask.to(device),
            )
            # First element of the model output holds the logits.
            batch_logits.append(outputs[0].detach().cpu())

    return torch.cat(batch_logits).argmax(dim=1).numpy()


def _analyze(search_words, number_of_tweets, by_term):
    """Fetch, classify and display the tweets for one form submission."""
    try:
        tweet_list = _fetch_tweets(search_words, number_of_tweets, by_term)
    except tw.TweepyException as exc:
        # Unknown user, empty query, auth or rate-limit problems end up here.
        st.error(f'Error al consultar Twitter: {exc}')
        return

    if not tweet_list:
        st.text('No se han encontrado tweets para analizar')
        return

    labels = _predict_labels([preprocess(tweet) for tweet in tweet_list])

    tweets_column = 'Latest' + str(number_of_tweets) + 'Tweets' + ' on ' + search_words
    df = pd.DataFrame(list(zip(tweet_list, labels)), columns=[tweets_column, 'Sexista'])
    # The code treats class index 0 as "not sexist" and any other as "sexist".
    df['Sexista'] = np.where(df['Sexista'] == 0, 'No Sexista', 'Sexista')
    st.table(df)


def run():
    """Render the input form and, on submit, show the classified tweets.

    Exactly one of the two checkboxes ('Término' / 'Usuario') must be ticked;
    otherwise an error message is shown and nothing is fetched.
    """
    with st.form(key='Introduzca Texto'):
        col, buff1, buff2 = st.columns([2, 2, 1])
        search_words = col.text_input('Introduzca el termino o usuario para analizar y pulse el check ')
        number_of_tweets = col.number_input('Introduzca número de twweets a analizar. Máximo 50', 0, 50, 10)
        termino = st.checkbox('Término')
        usuario = st.checkbox('Usuario')
        submit_button = col.form_submit_button(label='Analizar')

        if not submit_button:
            return

        if not termino and not usuario:
            st.text('Error no se ha seleccionado ningun check')
            return
        if termino and usuario:
            st.text('Error se han seleccionado los dos check')
            return

        _analyze(search_words, number_of_tweets, by_term=termino)


run()