import joblib
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from nltk.tokenize import word_tokenize
import re

# Load the trained classifier and the fitted vectorizer from disk.
# NOTE: joblib.load unpickles arbitrary objects, so these files must come
# from a trusted source.
model = joblib.load("hard_voting_classifier.pkl")
vectorizer = joblib.load("vectorizer.pkl")

# Load custom stopwords, one per line. The encoding is pinned so the result
# does not depend on the platform's default locale, and blank lines are
# skipped so the list never contains an empty-string entry. The result is
# kept as a list so code that indexes or concatenates it keeps working.
with open("Indonesia_stopwords.txt", "r", encoding="utf-8") as f:
    custom_stopwords = [line.strip() for line in f if line.strip()]

# Shared Sastrawi stemmer, created lazily on the first preprocess_data() call.
_STEMMER = None


def _get_stemmer():
    """Return the shared Sastrawi stemmer, building it on first use."""
    global _STEMMER
    if _STEMMER is None:
        _STEMMER = StemmerFactory().create_stemmer()
    return _STEMMER


def preprocess_data(text):
    """Normalize raw text into a space-joined string of stemmed tokens.

    Steps, in order:
      1. Lowercase the text.
      2. Delete URLs (``http://`` / ``https://``), digit runs, and every
         character that is not an ASCII letter, digit or whitespace. The
         characters are removed, not replaced by spaces.
      3. Tokenize with NLTK's ``word_tokenize``.
      4. Drop tokens found in ``custom_stopwords`` and stem the rest.

    Args:
        text: The raw input string.

    Returns:
        The surviving stemmed tokens joined by single spaces.
    """
    # Case folding
    text = text.lower()

    # Sentence normalization
    text = re.sub(r'http[s]?://\S+', '', text)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

    # Tokenization
    tokens = word_tokenize(text)

    # Stopword removal and stemming.
    # A set gives O(1) membership per token instead of scanning the list,
    # and it is rebuilt per call so later changes to custom_stopwords are
    # still honoured. The stemmer is reused because constructing one per
    # call is needlessly expensive.
    stopwords = set(custom_stopwords)
    stemmer = _get_stemmer()
    stemmed = [stemmer.stem(token) for token in tokens if token not in stopwords]

    return ' '.join(stemmed)

def predict_sentiment(text):
    """Classify ``text`` and return its sentiment label.

    The text is run through ``preprocess_data``, vectorized with the
    module-level ``vectorizer`` and passed to the module-level ``model``.

    Args:
        text: The raw input string.

    Returns:
        ``"Positive"`` when the model's prediction for the text equals 1,
        otherwise ``"Negative"``.
    """
    cleaned = preprocess_data(text)
    features = vectorizer.transform([cleaned])
    label = model.predict(features)[0]
    if label == 1:
        return "Positive"
    return "Negative"