fschwartzer committed on
Commit
ede2957
1 Parent(s): 91e5f2f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +27 -16
app.py CHANGED
@@ -4,6 +4,7 @@ import torch
4
  from transformers import pipeline
5
  import datetime
6
  from rapidfuzz import process, fuzz
 
7
 
8
  # Load the CSV file
9
  df = pd.read_csv("anomalies.csv", quotechar='"')
@@ -14,29 +15,39 @@ df['real'] = df['real'].apply(lambda x: f"{x:.2f}")
14
  # Fill NaN values and convert all columns to strings
15
  df = df.fillna('').astype(str)
16
 
17
- # Function to filter the DataFrame using RapidFuzz
18
- def filter_dataframe(df, date_str, group_keyword, threshold=80):
19
- # Apply fuzzy matching on the 'ds' (date) and 'Group' columns
20
- date_matches = process.extract(date_str, df['ds'], scorer=fuzz.token_sort_ratio, limit=None)
21
- group_matches = process.extract(group_keyword, df['Group'], scorer=fuzz.token_sort_ratio, limit=None)
22
 
23
- # Get the indices that match both criteria
24
- date_indices = {match[2] for match in date_matches if match[1] >= threshold}
25
- group_indices = {match[2] for match in group_matches if match[1] >= threshold}
26
- common_indices = list(date_indices & group_indices)
27
 
28
- return df.iloc[common_indices]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
 
30
  # Function to generate a response using the TAPAS model
31
  def response(user_question, df):
32
  a = datetime.datetime.now()
33
 
34
- # Extract date and group keywords from the user question
35
- date_str = "December 2022" # Example; you'd extract this from the user question dynamically
36
- group_keyword = "IPVA"
37
-
38
- # Filter the DataFrame by date and group
39
- subset_df = filter_dataframe(df, date_str, group_keyword)
40
 
41
  # Check if the DataFrame is empty
42
  if subset_df.empty:
 
4
  from transformers import pipeline
5
  import datetime
6
  from rapidfuzz import process, fuzz
7
+ from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
8
 
9
  # Load the CSV file
10
  df = pd.read_csv("anomalies.csv", quotechar='"')
 
15
  # Fill NaN values and convert all columns to strings
16
  df = df.fillna('').astype(str)
17
 
18
# Keep only the rows whose 'real' value is at least 1,000,000.
# NOTE(review): the previous comment said "10 Million", but the literal below
# is 1000000. (one million) -- confirm which cutoff is intended.
# Every column was converted to str just above, so 'real' has to be parsed
# back to numbers first; comparing str with float raises TypeError.
# Values that cannot be parsed (e.g. the '' used for missing data) become NaN
# and are therefore dropped by the comparison.
df = df[pd.to_numeric(df['real'], errors='coerce') >= 1000000.]
 
 
 
20
 
21
+ print(df)
 
 
 
22
 
23
+ # Function to remove stopwords
24
def remove_stopwords(text, stopwords=ENGLISH_STOP_WORDS):
    """Return ``text`` with every stopword dropped.

    The text is split on whitespace, each token is compared in lowercase
    against ``stopwords``, and the surviving tokens (in their original
    casing and order) are joined back together with single spaces.
    """
    kept_tokens = []
    for token in text.split():
        if token.lower() in stopwords:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)
26
+
27
+ # Function to filter DataFrame by checking if any of the user question words are in the columns
28
def filter_dataframe(df, user_question, threshold=80):
    """Return the rows of ``df`` in which any cell fuzzily matches a question word.

    Stopwords are removed from ``user_question``; every remaining word is
    compared with every cell of every column using RapidFuzz's
    ``token_sort_ratio``. A row is kept when at least one of its cells scores
    ``threshold`` or higher against at least one word.

    Args:
        df: DataFrame to search. Cells are handed to RapidFuzz unchanged, so
            they are presumably strings (the caller converts the frame with
            ``astype(str)``) -- verify if used elsewhere.
        user_question: Free-text question typed by the user.
        threshold: Minimum similarity score for a cell to count as a match.

    Returns:
        A DataFrame with the matching rows of ``df`` in their original order;
        empty when no word matches or the question contains only stopwords.
    """
    question_words = remove_stopwords(user_question).split()

    # Rows are tracked by position rather than by index label. ``df`` may have
    # been row-filtered before reaching this function, in which case its index
    # is no longer 0..n-1 and a boolean mask built on a fresh RangeIndex would
    # not align with it.
    matched_positions = set()

    for column in df.columns:
        # Matching against a plain list makes RapidFuzz report the positional
        # index of each hit (for a Series it would report the index label).
        cells = df[column].tolist()
        for word in question_words:
            matches = process.extract(
                word,
                cells,
                scorer=fuzz.token_sort_ratio,
                limit=None,
                # Drop hits scoring below the threshold inside RapidFuzz
                # instead of materialising and filtering them here.
                score_cutoff=threshold,
            )
            matched_positions.update(position for _, _, position in matches)

    # Sorting restores the original row order.
    return df.iloc[sorted(matched_positions)]
44
 
45
  # Function to generate a response using the TAPAS model
46
  def response(user_question, df):
47
  a = datetime.datetime.now()
48
 
49
+ # Filter the DataFrame dynamically by user question
50
+ subset_df = filter_dataframe(df, user_question)
 
 
 
 
51
 
52
  # Check if the DataFrame is empty
53
  if subset_df.empty: