pojitha committed on
Commit
aeed17f
1 Parent(s): cc35d81

Upload 7 files

Browse files
.gitattributes ADDED
@@ -0,0 +1 @@
 
 
1
+ sinhala-hate-speech-dataset filter=lfs diff=lfs merge=lfs -text
Sinhala_Singlish_Hate_Speech.csv ADDED
The diff for this file is too large to render. See raw diff
 
StopWords_425.txt ADDED
Binary file (9.2 kB). View file
 
Suffixes-413.txt ADDED
Binary file (5.32 kB). View file
 
app.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import numpy
3
+ from sklearn.pipeline import Pipeline
4
+ from sklearn.feature_extraction.text import TfidfVectorizer
5
+ from sklearn.svm import SVC
6
+ from sklearn.metrics import accuracy_score
7
+ import pandas as pd
8
+ import numpy as np
9
+ import streamlit as st
10
+
11
+
12
# Load the two labelled corpora that together form the training data.
df1 = pd.read_csv('sinhala-hate-speech-dataset.csv')
df2 = pd.read_csv('Sinhala_Singlish_Hate_Speech.csv')

# Give the second corpus the column names the rest of the script relies on
# ('comment' and 'label').
# NOTE(review): assumes df1 already uses these column names - confirm.
df2.columns = ["id", "comment", "label"]

# The second corpus stores labels as text: "YES" becomes 1, anything else 0.
df2['label'] = df2['label'].eq("YES").astype("int64")

df = pd.concat([df1, df2], sort=False)

# Rows without a comment or a label cannot be cleaned character by character
# or used for fitting, so drop them instead of only counting them.
df = df.dropna(subset=['comment', 'label'])
24
+
25
+ import re
26
+
27
exclude = set(",.:;'\"-?!/´`%")

# Translation table sending every excluded character to a single space, built
# once so each call is a single pass over the text.
_PUNCT_TO_SPACE = str.maketrans({ch: " " for ch in exclude})


def remove_punctutation(text):
    """Return *text* with every character in ``exclude`` replaced by a space.

    Characters are replaced one-for-one, so the result always has the same
    length as the input and runs of punctuation become runs of spaces.
    """
    return text.translate(_PUNCT_TO_SPACE)
30
+
31
def remove_numbers(text):
    """Return *text* without the characters for which ``str.isnumeric`` is true."""
    kept = [ch for ch in text if not ch.isnumeric()]
    return ''.join(kept)
33
+
34
# Derive the cleaned text columns: punctuation is blanked out first, then
# numeric characters are stripped from that result.
df['clean_data'] = df['comment'].apply(remove_punctutation)
df['cleand'] = df['clean_data'].apply(remove_numbers)
37
+
38
+ import nltk
39
+ from nltk.tokenize import word_tokenize
40
# word_tokenize needs the Punkt tokenizer data. Recent NLTK releases look for
# the 'punkt_tab' resource while older ones use 'punkt', so fetch both.
# quiet=True keeps the downloader from logging on every run of the script.
for _punkt_resource in ('punkt', 'punkt_tab'):
    nltk.download(_punkt_resource, quiet=True)

# Split each cleaned comment into a list of word tokens.
df['tokens'] = df['cleand'].apply(word_tokenize)
43
+
44
# Read the stop-word file (UTF-16 encoded) in one go.
with open("StopWords_425.txt", "r", encoding="utf-16") as file:
    contents = file.read()

# The file is treated as whitespace-separated entries; anything containing a
# digit is skipped so only the words themselves remain.
stop_word = [
    word
    for word in contents.split()
    if not any(char.isdigit() for char in word)
]

# stop_word stays a list because it is also passed to TfidfVectorizer later in
# the script; the set copy makes the per-token membership test O(1).
_stop_word_lookup = set(stop_word)
df['tokens'] = df['tokens'].apply(
    lambda x: [item for item in x if item not in _stop_word_lookup]
)
52
+
53
+ import nltk
54
+ from nltk.tokenize import word_tokenize
55
+
56
# Build the lookup set from Suffixes-413.txt (UTF-16): one entry per line,
# lines containing a digit are ignored, surrounding whitespace is stripped.
with open('Suffixes-413.txt', 'r', encoding='utf-16') as f:
    stemmed_words = {
        line.strip()
        for line in f
        if not any(char.isdigit() for char in line)
    }
62
+
63
# One shared stemmer instance; constructing a PorterStemmer for every single
# word is wasted work.
_porter_stemmer = nltk.stem.PorterStemmer()


def stem_word(word):
    """Return *word* unchanged if it is listed in ``stemmed_words``.

    Any other word is passed through NLTK's Porter stemmer and the stemmed
    form is returned.
    """
    if word in stemmed_words:
        return word
    return _porter_stemmer.stem(word)


# NOTE(review): 'cleaneddata' is not read again in the visible script - the
# pipeline below is fit on the raw 'comment' column. Confirm whether the
# model was meant to train on this column instead.
df['cleaneddata'] = df['tokens'].apply(lambda x: [stem_word(word) for word in x])
70
+
71
+
72
# Text classifier: TF-IDF features (custom stop-word list, tokens matched by
# the \b\w+\b pattern) feeding an SVC with default settings.
_tfidf_step = TfidfVectorizer(stop_words=stop_word, token_pattern=r'\b\w+\b')
_svm_step = SVC()
pipeline = Pipeline(steps=[('tfidf', _tfidf_step), ('svm', _svm_step)])
76
+
77
+ from sklearn.model_selection import train_test_split
78
+
79
# Hold out 30% of the rows and fit on the rest. The model is trained on the
# raw 'comment' text. The seed is fixed because Streamlit re-executes this
# script on every widget interaction: with an unseeded split each rerun would
# train a different model, and the prediction shown for the same sentence
# could flip while the user is answering the feedback question.
X_train, X_test, y_train, y_test = train_test_split(
    df['comment'], df['label'], test_size=0.3, random_state=42
)

pipeline.fit(X_train, y_train)
82
+
83
+
84
+
85
st.title("Sinhala Hate Speech Detector")

# User input section.
user_input = st.text_input("Enter a sentence")

# Model output and feedback section.
if user_input:
    # The pipeline predicts 1 for hate speech and 0 otherwise.
    user_pred = pipeline.predict([user_input])[0]
    if user_pred == 1:
        st.write("This sentence is hate.")
    else:
        st.write("This sentence is not hate.")

    # The widget key depends on the sentence so the answer resets to the
    # placeholder whenever a different sentence is entered; otherwise a
    # previous "No" would silently be applied to the new sentence.
    add_to_df = st.selectbox(
        "Is this correct?",
        ["Choose a Option", "Yes", "No"],
        index=0,
        key=f"feedback::{user_input}",
    )

    if add_to_df == "Yes":
        st.write("Thank you")
    elif add_to_df == "No":
        # Only an explicit "No" counts as a correction; the placeholder option
        # must not be treated as feedback. The corrected label is the opposite
        # of what the model predicted.
        corrected_label = 0 if user_pred == 1 else 1
        new_row = pd.DataFrame([{'comment': user_input, 'label': corrected_label}])

        # Keep the in-memory training frame up to date.
        df = pd.concat([df, new_row], ignore_index=True)

        # Persist the correction into the first corpus only. Writing the merged
        # frame back to this file would copy the second corpus and the derived
        # columns into it, and they would be concatenated again on next load.
        # index=False avoids adding an extra unnamed index column per save.
        df1 = pd.concat([df1, new_row], ignore_index=True)
        df1.to_csv("sinhala-hate-speech-dataset.csv", index=False)

        # Retrain right away, making sure the corrected sentence ends up in
        # the training part even if the random split put it in the test part.
        X_train, X_test, y_train, y_test = train_test_split(
            df['comment'], df['label'], test_size=0.3
        )
        processed_text = pd.Series([user_input])
        X_train = pd.concat([X_train, processed_text], ignore_index=True)
        y_train = pd.concat([y_train, pd.Series([corrected_label])], ignore_index=True)
        pipeline.fit(X_train, y_train)
        st.write("Thank you for your contribution. We added that word into our system.")
requirements.txt ADDED
Binary file (41.5 kB). View file
 
sinhala-hate-speech-dataset ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:372a6f64a4b68a8f5f820eac885dfa3526151acfab38dfb725d03f821de77c94
3
+ size 12901950
sinhala-hate-speech-dataset.csv ADDED
The diff for this file is too large to render. See raw diff