File size: 4,308 Bytes
03d6e86
 
 
 
 
caea1f5
ede2957
03d6e86
bafee93
57c5821
49124ad
7413ee9
 
 
de6d203
57c5821
49124ad
57c5821
5f82549
 
ede2957
91e5f2f
ede2957
 
 
 
 
 
 
 
 
ef37c27
ede2957
 
 
 
 
 
ef37c27
ede2957
 
 
 
b1a6dfa
de6d203
048e2e2
03d6e86
 
ede2957
 
91e5f2f
 
 
 
bafee93
57c5821
b1a6dfa
 
5f82549
57c5821
caea1f5
 
5f82549
 
57c5821
b1a6dfa
bafee93
91e5f2f
b1a6dfa
91e5f2f
 
03d6e86
 
 
 
 
57c5821
03d6e86
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9adca6f
03d6e86
 
 
 
 
 
 
 
 
 
 
 
 
 
57c5821
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import datetime
import html

import pandas as pd
import streamlit as st
import torch
from rapidfuzz import process, fuzz
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from transformers import pipeline

# Read the anomalies table from disk
df = pd.read_csv("anomalies.csv", quotechar='"')

# Keep only the rows whose 'real' value is at least 1,000,000
df = df.loc[df['real'] >= 1000000.]

# Render 'real' as fixed-point text with two decimals
df = df.assign(real=df['real'].map("{:.2f}".format))

# Blank out missing values, then turn every column into strings
df = df.fillna('').astype(str)

# Dump the prepared table to stdout
print(df)

# Drop stopwords from a piece of text
def remove_stopwords(text, stopwords=ENGLISH_STOP_WORDS):
    """Return ``text`` with every stopword token removed.

    Tokens come from whitespace splitting and are compared against
    ``stopwords`` in lowercase. Surviving tokens keep their original casing
    and are joined back with single spaces.
    """
    kept_tokens = []
    for token in text.split():
        if token.lower() in stopwords:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

# Function to filter DataFrame by checking if any of the user question words are in the columns
def filter_dataframe(df, user_question, threshold=80):
    """Return the rows of ``df`` that fuzzily match a word of the question.

    The question is stripped of stopwords and split on whitespace. Each
    remaining word is compared against every cell of every column with
    RapidFuzz's ``token_sort_ratio``; a row is kept as soon as one of its
    cells scores at least ``threshold`` for one of the words.

    Args:
        df: Table to filter. Cells are passed to RapidFuzz as-is, so they
            are expected to be strings.
        user_question: Free-text question typed by the user.
        threshold: Minimum similarity score (RapidFuzz scorers report
            0-100) for a cell to count as a match.

    Returns:
        The subset of ``df`` whose rows matched, in the original row order.
        Empty when nothing matched or the question had no usable words.
    """
    # dict.fromkeys de-duplicates while preserving first-seen order, so a
    # repeated word is not matched against the whole table twice.
    question_words = list(dict.fromkeys(remove_stopwords(user_question).split()))

    # Explicit bool dtype keeps this a valid boolean mask even for an empty frame.
    mask = pd.Series(False, index=df.index, dtype=bool)
    if not question_words:
        return df[mask]

    for column in df.columns:
        cells = df[column]
        for word in question_words:
            # score_cutoff lets RapidFuzz drop weak matches itself instead of
            # returning (and sorting) every row for filtering in Python.
            matches = process.extract(
                word,
                cells,
                scorer=fuzz.token_sort_ratio,
                limit=None,
                score_cutoff=threshold,
            )
            # For a Series, the third tuple element is the index label.
            matched_labels = [label for _, _, label in matches]
            if matched_labels:
                mask.loc[matched_labels] = True

    return df[mask]

# Build the TAPAS pipeline once; st.cache_resource keeps it alive across
# Streamlit reruns instead of reloading the model for every question.
@st.cache_resource
def _load_tqa_pipeline():
    """Create the TAPAS table-question-answering pipeline."""
    return pipeline(task="table-question-answering", model="google/tapas-large-finetuned-wtq",
                    tokenizer_kwargs={"clean_up_tokenization_spaces": False})


# Function to generate a response using the TAPAS model
def response(user_question, df):
    """Answer ``user_question`` from the rows of ``df`` using TAPAS.

    The table is first narrowed with ``filter_dataframe``; the remaining rows
    are handed to the table-question-answering pipeline together with the
    question.

    Args:
        user_question: Free-text question typed by the user.
        df: Table to query.

    Returns:
        A dict with the single key ``"Resposta"``. Its value is the model's
        answer, or an apology message when no rows match the question or
        when the pipeline raises ``ValueError``.
    """
    started_at = datetime.datetime.now()

    # Filter the DataFrame dynamically by user question
    subset_df = filter_dataframe(df, user_question)

    # Nothing to query: answer immediately without touching the model
    if subset_df.empty:
        return {"Resposta": "Desculpe, não há dados disponíveis para responder à sua pergunta."}

    # Reuse the cached TAPAS pipeline
    tqa = _load_tqa_pipeline()

    # Debugging information
    print("Filtered DataFrame shape:", subset_df.shape)
    print("Filtered DataFrame head:\n", subset_df.head())
    print("User question:", user_question)

    # Query the TAPAS model
    try:
        answer = tqa(table=subset_df, query=user_question)['answer']
    except ValueError as e:
        print(f"Error: {e}")
        answer = "Desculpe, ocorreu um erro ao processar sua pergunta."

    query_result = {
        "Resposta": answer
    }

    print("Time taken:", datetime.datetime.now() - started_at)

    return query_result

# Streamlit interface: page header made of three colored circles
# (green, red, yellow) followed by the chatbot title
_HEADER_HTML = """
<div style='display: flex; align-items: center;'>
    <div style='width: 40px; height: 40px; background-color: green; border-radius: 50%; margin-right: 5px;'></div>
    <div style='width: 40px; height: 40px; background-color: red; border-radius: 50%; margin-right: 5px;'></div>
    <div style='width: 40px; height: 40px; background-color: yellow; border-radius: 50%; margin-right: 5px;'></div>
    <span style='font-size: 40px; font-weight: bold;'>Chatbot do Tesouro RS</span>
</div>
"""
st.markdown(_HEADER_HTML, unsafe_allow_html=True)

# Chat history, kept in session state so it survives Streamlit reruns
if 'history' not in st.session_state:
    st.session_state['history'] = []

# Last question that was already answered. Streamlit re-executes this script
# on every interaction while the text box keeps its value, so without this
# guard any rerun (e.g. clicking "Limpar") would answer the same question again.
if 'last_question' not in st.session_state:
    st.session_state['last_question'] = ""

# Input box for user question
user_question = st.text_input("Escreva sua questão aqui:", "")

if user_question and user_question != st.session_state['last_question']:
    # Generate the response first so a failure leaves the history untouched
    bot_response = response(user_question, df)["Resposta"]

    # Record the exchange; rendering happens once, in the history loop below
    st.session_state['history'].append(('👤', user_question))
    st.session_state['history'].append(('🤖', bot_response))
    st.session_state['last_question'] = user_question

# Clear history button
if st.button("Limpar"):
    st.session_state['history'] = []

# Display chat history (user on the left, bot aligned to the right)
for sender, message in st.session_state['history']:
    if sender == '👤':
        st.markdown(f"**👤 {message}**")
    elif sender == '🤖':
        # The answer is derived from table/model text and is injected into raw
        # HTML, so escape it. Markdown emphasis is not applied inside an HTML
        # block, hence <strong> instead of **...**.
        safe_message = html.escape(str(message))
        st.markdown(
            f"<div style='text-align: right'><strong>🤖 {safe_message}</strong></div>",
            unsafe_allow_html=True,
        )