"""Streamlit app that searches Twitter and flags sexist tweets.

Tweets matching a user-supplied term are downloaded through tweepy, classified
with a fine-tuned Hugging Face sequence-classification model, and displayed in
a table next to the predicted label.
"""
import re

import numpy as np
import pandas as pd
import streamlit as st
import torch
import tweepy as tw
# NOTE(review): `re`, `RandomSampler` and `AdamW` are not used by the code
# below; they are kept in case something outside this script relies on them.
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW

MODEL_NAME = 'hackathon-pln-es/twitter_sexismo-finetuned-exist2021-metwo'
MAX_LENGTH = 128  # every tweet is padded/truncated to this many tokens
BATCH_SIZE = 25   # number of tweets per forward pass

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

if torch.cuda.is_available():
    device = torch.device("cuda")
    print('I will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

# Input batches are moved to `device` before the forward pass, so the model
# has to live on the same device or the call fails on a CUDA machine.
model.to(device)

consumer_key = st.secrets["consumer_key"]
consumer_secret = st.secrets["consumer_secret"]
access_token = st.secrets["access_token"]
access_token_secret = st.secrets["access_token_secret"]

auth = tw.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tw.API(auth, wait_on_rate_limit=True)

st.title('Analisis de comentarios sexistas en Twitter con Tweepy and HuggingFace Transformers')
st.markdown('Esta app utiliza tweepy para descargar tweets de twitter en base a la información de entrada y procesa los tweets usando transformers de HuggingFace para detectar comentarios sexistas. El resultado y los tweets correspondientes se almacenan en un dataframe para mostrarlo que es lo que se ve como resultado')


def _predict_labels(texts):
    """Return the predicted class index for every string in *texts*.

    Args:
        texts: Non-empty list of tweet texts.

    Returns:
        A 1-D numpy array holding the argmax class index of each text, in
        the same order as *texts*.
    """
    encoded = tokenizer(
        texts,
        max_length=MAX_LENGTH,
        add_special_tokens=True,
        return_attention_mask=True,
        padding='max_length',  # replaces the deprecated pad_to_max_length=True
        truncation=True,
        return_tensors='pt',
    )
    dataset = TensorDataset(encoded['input_ids'], encoded['attention_mask'])
    # Sequential sampling keeps predictions aligned with the input order.
    dataloader = DataLoader(
        dataset,
        sampler=SequentialSampler(dataset),
        batch_size=BATCH_SIZE,
    )

    print('Predicting labels for {:,} test sentences...'.format(len(dataset)))

    model.eval()
    logits_per_batch = []
    for batch in dataloader:
        input_ids, attention_mask = (t.to(device) for t in batch)
        # Inference only: skip gradient bookkeeping.
        with torch.no_grad():
            outputs = model(input_ids, token_type_ids=None, attention_mask=attention_mask)
        # The first element of the model output holds the logits.
        logits_per_batch.append(outputs[0].detach().cpu().numpy())

    return np.concatenate(logits_per_batch, axis=0).argmax(axis=1)


def _show_results(search_words, number_of_tweets):
    """Download tweets for *search_words*, classify them and render a table.

    Shows a warning instead of a table when the query is blank, when no
    tweets are requested, or when the search returns nothing.
    """
    if not search_words.strip():
        st.warning('Introduzca un termino para analizar.')
        return
    if number_of_tweets <= 0:
        st.warning('El número de tweets debe ser mayor que 0.')
        return

    tweets = tw.Cursor(api.search_tweets, q=search_words).items(number_of_tweets)
    tweet_list = [tweet.text for tweet in tweets]
    if not tweet_list:
        # Without this guard the empty result crashes during classification.
        st.warning('No se encontraron tweets para el termino indicado.')
        return

    labels = _predict_labels(tweet_list)

    tweets_column = 'Latest' + str(number_of_tweets) + 'Tweets' + ' on ' + search_words
    df = pd.DataFrame(
        list(zip(tweet_list, labels)),
        columns=[tweets_column, 'Sexista'],
    )
    # Class index 0 is displayed as "No Sexista"; any other index as "Sexista".
    df['Sexista'] = np.where(df['Sexista'] == 0, 'No Sexista', 'Sexista')
    st.table(df)


def run():
    """Render the search form and, once submitted, the classification table."""
    with st.form(key='Introduzca nombre'):
        search_words = st.text_input('Introduzca el termino para analizar')
        number_of_tweets = st.number_input('Introduzca número de twweets a analizar. Máximo 50', 0, 50, 10)
        submit_button = st.form_submit_button(label='Submit')
        # NOTE(review): results are rendered inside the form container;
        # dedent this branch if the table should appear below the form.
        if submit_button:
            _show_results(search_words, number_of_tweets)


run()