### Data Preprocessing

In [None]:
!pip3 install opencv-python

Collecting opencv-python
 Downloading opencv-python-4.10.0.84.tar.gz (95.1 MB)
[2K [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.1/95.1 MB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25h Installing build dependencies ... [?25ldone
[?25h Getting requirements to build wheel ... [?25ldone
[?25h Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: opencv-python
 Building wheel for opencv-python (pyproject.toml) ... [?25l/

In [None]:
import os
import cv2
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from scipy.spatial import distance
from collections import Counter
import seaborn as sns
import joblib

In [None]:
# Evaluate classifiers
def evaluate_classifier(y_true, y_pred, classifier_name):
    """Print accuracy / F1 and plot the confusion matrix for a binary classifier.

    Args:
        y_true: Ground-truth labels (0 = Real Photo, 1 = CGI).
        y_pred: Predicted labels, same length as ``y_true``.
        classifier_name: Name used in the printed summary and the plot title.
    """
    class_names = ['Real Photo', 'CGI']

    acc = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    # Pin the label order so the matrix is always 2x2 and lines up with the
    # tick labels below, even if only one class appears in y_true / y_pred.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    print(f"{classifier_name} - Accuracy: {acc:.4f}, F1 Score: {f1:.4f}")
    print(f"Confusion Matrix:\n{cm}\n")

    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
    plt.title(f'Confusion Matrix for {classifier_name}')
    plt.xlabel('Predicted Labels')
    plt.ylabel('True Labels')
    plt.show()

In [None]:
import numpy as np
from PIL import Image
from scipy.fftpack import fft2
from tensorflow.keras.models import load_model, Model

# Function to apply Fourier transform
def apply_fourier_transform(image):
    """Return the magnitude of the 2-D FFT of ``image``.

    ``image`` may be anything ``np.array`` accepts (e.g. a PIL image or an
    ndarray); the result has the same shape as the converted array.
    """
    pixels = np.array(image)
    return np.abs(fft2(pixels))

# Function to preprocess image
def preprocess_image(image_path):
    """Load an image and turn it into a single-item batch of FFT magnitudes.

    The image is converted to grayscale, resized to 256x256, passed through
    ``apply_fourier_transform`` and given a leading batch axis and a trailing
    channel axis.

    Returns:
        The batched array, or ``None`` if anything goes wrong (the error is
        printed rather than raised).
    """
    try:
        grayscale = Image.open(image_path).convert('L').resize((256, 256))
        spectrum = apply_fourier_transform(grayscale)
        # Add the batch axis in front and the channel axis at the end in one step.
        return spectrum[np.newaxis, ..., np.newaxis]
    except Exception as e:
        print(f"Error processing image {image_path}: {e}")
        return None

# Function to load embedding model and calculate embeddings
# Cache of loaded embedding models keyed by model path, so repeated calls
# (e.g. once per image in a folder) do not reload the model from disk.
_EMBEDDING_MODEL_CACHE = {}


def _get_embedding_model(model_path):
    """Load the model at ``model_path`` once and return the cached instance."""
    if model_path not in _EMBEDDING_MODEL_CACHE:
        model = load_model(model_path)
        # NOTE(review): this wraps the model's full output; no layer is removed.
        # Presumably the saved model already ends at the embedding layer -
        # confirm against the training notebook.
        _EMBEDDING_MODEL_CACHE[model_path] = Model(inputs=model.input, outputs=model.output)
    return _EMBEDDING_MODEL_CACHE[model_path]


def calculate_embeddings(image_path, model_path='embedding_modelv2.keras'):
    """Compute the embedding of a single image.

    Args:
        image_path: Path of the image to embed.
        model_path: Path of the saved Keras model that produces embeddings.

    Returns:
        Whatever the model's ``predict`` returns for the one-image batch.

    Raises:
        ValueError: If the image could not be loaded / preprocessed.
    """
    preprocessed_image = preprocess_image(image_path)
    if preprocessed_image is None:
        # preprocess_image reports failures by returning None; fail here with a
        # clear message instead of handing None to the model.
        raise ValueError(f"Could not preprocess image: {image_path}")

    embedding_model = _get_embedding_model(model_path)
    return embedding_model.predict(preprocessed_image)


def calculate_embeddings_folder(folder_path, model_path='embedding_modelv2.keras'):
    """Embed every image in a folder and label it from the folder name.

    Files are visited in sorted order so the result is reproducible across
    runs and file systems. Extensions are matched case-insensitively and
    include ``.jpeg``, consistent with ``load_images``.

    Args:
        folder_path: Directory to scan (not recursive). If the path contains
            the substring "CGI" every image is labelled 1, otherwise 0.
        model_path: Path of the saved embedding model.

    Returns:
        ``(embeddings, labels)`` - two parallel lists, one entry per image.
    """
    image_extensions = (".jpg", ".jpeg", ".png")
    # The label depends only on the folder, so compute it once.
    label = 1 if "CGI" in folder_path else 0

    embeddings = []
    labels = []
    for filename in sorted(os.listdir(folder_path)):
        if not filename.lower().endswith(image_extensions):
            continue
        image_path = os.path.join(folder_path, filename)
        embeddings.append(calculate_embeddings(image_path, model_path))
        labels.append(label)
    return embeddings, labels

In [None]:
embeddings = np.load('embeddings.npy')
labels = np.load('labels.npy')

In [None]:
X_train, X_test, y_train, y_test = train_test_split(embeddings, labels, test_size=0.2, random_state=42, stratify=labels)

In [None]:
X_test.shape

In [None]:
# Early stopping needs a monitoring set. Carve it out of the training split so
# the test split is never seen during fitting and the reported metrics are not
# biased by model selection on the same data.
X_fit, X_early_stop, y_fit, y_early_stop = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

xgb_clf = XGBClassifier(use_label_encoder=False, eval_metric='logloss', early_stopping_rounds=10)
xgb_clf.fit(X_fit, y_fit, eval_set=[(X_early_stop, y_early_stop)], verbose=False)
y_pred_xgb = xgb_clf.predict(X_test)
evaluate_classifier(y_test, y_pred_xgb, "XGBoost Classifier")

In [None]:
from sklearn.neural_network import MLPClassifier as MLP
from sklearn.svm import SVC

In [None]:
# Naive random classifier
class RandomClassifier:
    """Chance-level baseline: predicts 0 or 1 uniformly at random.

    Args:
        random_state: Optional integer seed. When given, predictions are drawn
            from a private seeded generator so runs are reproducible. When
            ``None`` (the default) the global ``np.random`` state is used,
            exactly as before.
    """

    def __init__(self, random_state=None):
        self.random_state = random_state
        # np.random (module) and RandomState both expose ``choice``.
        self._rng = np.random if random_state is None else np.random.RandomState(random_state)

    def fit(self, X, y):
        """No-op; present only so the class matches the other classifiers."""
        pass

    def predict(self, X):
        """Return one random label in {0, 1} per row of ``X``."""
        return self._rng.choice([0, 1], size=X.shape[0])

class MeanClassifier:
    """Nearest-class-mean classifier for the binary labels 0 and 1.

    ``fit`` stores the mean feature vector of each class; a sample is assigned
    to the class whose mean is closer in Euclidean distance. A class with no
    training samples has mean ``None`` and is treated as infinitely far away.
    """

    def fit(self, X, y):
        """Compute the per-class mean vectors from ``X`` (rows) and ``y``."""
        # Convert up front so boolean masking also works for plain lists.
        X = np.asarray(X)
        y = np.asarray(y)
        self.mean_0 = np.mean(X[y == 0], axis=0) if np.any(y == 0) else None
        self.mean_1 = np.mean(X[y == 1], axis=0) if np.any(y == 1) else None

    def predict(self, X):
        """Return an array with the label of the nearest class mean per row."""
        preds = []
        for x in X:
            dist_0, dist_1 = self.mean_distance(x)
            # Ties go to class 0.
            preds.append(1 if dist_1 < dist_0 else 0)
        return np.array(preds)

    def predict_proba(self, X):
        """Return ``[P(class 0), P(class 1)]`` per row.

        Probabilities are a softmax over the negated distances to the two
        class means.
        """
        probs = []
        for x in X:
            logits = -np.array(self.mean_distance(x), dtype=float)
            top = logits.max()
            if np.isinf(top):
                # Neither class has a mean: no information, so stay neutral
                # instead of producing 0/0 = NaN.
                probs.append([0.5, 0.5])
                continue
            # Shift by the max before exponentiating so large distances do not
            # underflow both terms to zero.
            weights = np.exp(logits - top)
            probs.append(list(weights / weights.sum()))
        return np.array(probs, dtype=float).reshape(-1, 2)

    def mean_distance(self, x):
        """Return ``(distance to class-0 mean, distance to class-1 mean)``."""
        dist_mean_0 = distance.euclidean(x, self.mean_0) if self.mean_0 is not None else np.inf
        dist_mean_1 = distance.euclidean(x, self.mean_1) if self.mean_1 is not None else np.inf
        return dist_mean_0, dist_mean_1

# Build every baseline up front. XGBoost is not part of this group; it is
# trained and evaluated in its own cell.
random_clf = RandomClassifier()
mean_clf = MeanClassifier()
knn_clf = KNeighborsClassifier(n_neighbors=10)
rf_clf = RandomForestClassifier(max_depth=10, random_state=42)
mlp_clf = MLP(hidden_layer_sizes=(128,), max_iter=1000, random_state=42)
svc_clf = SVC()

# Fit all of them on the same training split, in a fixed order.
for baseline in (random_clf, mean_clf, knn_clf, rf_clf, mlp_clf, svc_clf):
    baseline.fit(X_train, y_train)

# Predict on the held-out split, again in the same fixed order.
y_pred_random, y_pred_mean, y_pred_knn, y_pred_rf, y_pred_mlp, y_pred_svc = [
    fitted.predict(X_test)
    for fitted in (random_clf, mean_clf, knn_clf, rf_clf, mlp_clf, svc_clf)
]

In [None]:
evaluate_classifier(y_test, y_pred_random, "Random Classifier")
evaluate_classifier(y_test, y_pred_mean, "Mean Classifier")
evaluate_classifier(y_test, y_pred_knn, "KNN Classifier")

In [None]:
evaluate_classifier(y_test, y_pred_xgb, "XGBoost Classifier")
evaluate_classifier(y_test, y_pred_rf, "Random Forest Classifier")
evaluate_classifier(y_test, y_pred_svc, "SVC Classifier")

In [None]:
evaluate_classifier(y_test, y_pred_mlp, "MLP Classifier")

In [None]:
test_filename = "neytiri.png"

In [None]:
test_embeddings = calculate_embeddings(test_filename, model_path='embedding_modelv2.keras')

In [None]:
def print_prob(model, image_path):
    """Print the Real Photo / CGI probabilities ``model`` gives to one image.

    ``model`` must expose ``predict_proba``; the image is embedded with
    ``calculate_embeddings`` using the 'embedding_modelv2.keras' model.
    """
    image_embedding = calculate_embeddings(image_path, model_path='embedding_modelv2.keras')
    # Only one image is scored, so only the first row matters.
    first_row = model.predict_proba(image_embedding)[0]
    print(f"Real Photo Probability: {first_row[0]:.4f}")
    print(f"CGI Probability: {first_row[1]:.4f}")

In [None]:
print_prob(mlp_clf, test_filename)

In [None]:
print_prob(mean_clf, test_filename)

In [None]:
print_prob(xgb_clf, test_filename)

In [None]:
print_prob(rf_clf, test_filename)

In [None]:
print_prob(knn_clf, test_filename)

In [None]:
dist = np.round(mean_clf.mean_distance(test_embeddings[0]), 2)
print(f"Dist to real mean {dist[0]}")
print(f"Dist to CGI mean {dist[1]}")

In [None]:
def embedding_distance(image_path_1, image_path_2):
    """Return the L2 norm of the difference between two images' embeddings.

    Both images are embedded with ``calculate_embeddings`` using its default
    model path.
    """
    first, second = [calculate_embeddings(path) for path in (image_path_1, image_path_2)]
    return np.linalg.norm(first - second)

## Visualizing Feature Space

In [None]:
# prompt: How can I plot embeddings on a t-SNE scatter plot and colored by the label? A label of 1 should be "CGI" in the legend and 0 should be "Real Photo"

import matplotlib.pyplot as plt
# Project the embeddings to 2-D with t-SNE (fixed seed for a repeatable layout).
tsne = TSNE(n_components=2, random_state=42)
embeddings_2d = tsne.fit_transform(embeddings)

# Legend text for every point: label 1 is "CGI", anything else "Real Photo".
class_names = ['CGI' if label == 1 else 'Real Photo' for label in labels]
two_class_palette = sns.color_palette("hsv", 2)

# Scatter the projected points, coloured by class.
plt.figure(figsize=(10, 7))
sns.scatterplot(
    x=embeddings_2d[:, 0],
    y=embeddings_2d[:, 1],
    hue=class_names,
    palette=two_class_palette,
    legend="full",
)
plt.title("t-SNE of Image Embeddings")
plt.xlabel("t-SNE component 1")
plt.ylabel("t-SNE component 2")
plt.show()

In [None]:
# prompt: Can you write a function that visualizes the embeddings using t-sne with the labels but allows a parameter which is an image path and preprocesses the image and calculates the embeddings and plots this embedding as well?

import matplotlib.pyplot as plt
import numpy as np
def visualize_embeddings_with_new_image(image_path, embeddings, labels):
    """Plot a t-SNE projection of existing embeddings plus one new image.

    The new image is embedded, appended to ``embeddings``, and t-SNE is fit on
    the combined set. Existing points are coloured by class; the new image is
    drawn as a black star.

    Args:
        image_path: Path to the new image.
        embeddings: Existing embeddings, one row per sample.
        labels: Labels for ``embeddings`` (1 = CGI, otherwise Real Photo).
    """
    # Embed the new image. Assumes the result is 2-D with a single row so it
    # can be stacked under ``embeddings`` - TODO confirm against the model.
    new_embedding = calculate_embeddings(image_path, model_path='embedding_modelv2.keras')

    # The new sample goes last; 2 is a placeholder label for it.
    all_embeddings = np.concatenate((embeddings, new_embedding), axis=0)
    all_labels = np.concatenate((labels, [2]), axis=0)

    # Fit t-SNE on everything so the new point shares the same 2-D space.
    tsne = TSNE(n_components=2, random_state=42)
    projected = tsne.fit_transform(all_embeddings)
    reference_points = projected[:-1]
    new_point = projected[-1]
    reference_names = ['CGI' if label == 1 else 'Real Photo' for label in all_labels[:-1]]

    plt.figure(figsize=(10, 7))

    # Existing samples, coloured by class.
    sns.scatterplot(
        x=reference_points[:, 0],
        y=reference_points[:, 1],
        hue=reference_names,
        palette=sns.color_palette("hsv", 2),
        legend="full",
    )

    # The new image, highlighted.
    plt.scatter(
        x=new_point[0],
        y=new_point[1],
        color='black',
        marker='*',
        s=200,
        label='New Image',
    )

    plt.title("t-SNE of Image Embeddings with New Image")
    plt.xlabel("t-SNE component 1")
    plt.ylabel("t-SNE component 2")
    plt.legend()
    plt.show()

# Example usage:
# visualize_embeddings_with_new_image("path/to/your/new/image.jpg", embeddings, labels)


In [None]:
visualize_embeddings_with_new_image("neytiri.png", embeddings, labels)

### Evaluating on the Validation Set

In [None]:
!unzip Validation.zip

In [None]:
cgi_val_images, cgi_val_labels = calculate_embeddings_folder('Validation/CGI')
photo_val_images, photo_val_labels = calculate_embeddings_folder('Validation/Photo')

print(f"CGI shape {np.array(cgi_val_images).shape}")
print(f"Photo shape {np.array(photo_val_images).shape}")

In [None]:
# prompt: Can you test the validation images and labels against the XGB, Mean, and KNN classifiers?

import numpy as np
# Combine validation data (CGI first, then Photo, matching the label order).
X_val = np.concatenate((cgi_val_images, photo_val_images), axis=0)
y_val = np.concatenate((cgi_val_labels, photo_val_labels), axis=0)

# Flatten each sample to a single feature vector so the shape matches what the
# classifiers were trained on.
X_val = X_val.reshape(X_val.shape[0], -1)

# Predict using classifiers
y_pred_xgb_val = xgb_clf.predict(X_val)
y_pred_mean_val = mean_clf.predict(X_val)
y_pred_knn_val = knn_clf.predict(X_val)
y_pred_svc_val = svc_clf.predict(X_val)
y_pred_rf_val = rf_clf.predict(X_val)
y_pred_mlp_val = mlp_clf.predict(X_val)

# Evaluate classifiers on validation set
evaluate_classifier(y_val, y_pred_xgb_val, "XGBoost Classifier (Validation)")
evaluate_classifier(y_val, y_pred_mean_val, "Mean Classifier (Validation)")
evaluate_classifier(y_val, y_pred_knn_val, "KNN Classifier (Validation)")
evaluate_classifier(y_val, y_pred_svc_val, "SVC Classifier (Validation)")
evaluate_classifier(y_val, y_pred_rf_val, "Random Forest Classifier (Validation)")
# The MLP predictions were computed above but never reported.
evaluate_classifier(y_val, y_pred_mlp_val, "MLP Classifier (Validation)")


### Old Preprocessing

In [None]:
# Function to load and preprocess images
def load_images(folder, label):
    """Read every .jpg / .png / .jpeg in ``folder`` as a 256x256 grayscale image.

    Files that OpenCV cannot decode are skipped.

    Returns:
        ``(images, labels)`` - the resized images and a parallel list holding
        ``label`` once per loaded image.
    """
    accepted_suffixes = (".jpg", ".png", ".jpeg")
    images, labels = [], []
    for filename in os.listdir(folder):
        if not filename.endswith(accepted_suffixes):
            continue
        img = cv2.imread(os.path.join(folder, filename), cv2.IMREAD_GRAYSCALE)
        if img is None:
            # imread signals an unreadable file by returning None.
            continue
        images.append(cv2.resize(img, (256, 256)))
        labels.append(label)
    return images, labels

pca = PCA(n_components=128)
# Function to perform Fourier transform and extract features
def extract_features(images):
    """Turn images into PCA-reduced log-magnitude Fourier features.

    For each image the centred 2-D FFT magnitude is converted to a log scale
    and flattened; the stacked vectors are then reduced with the module-level
    ``pca`` object, which is re-fit on every call.

    Args:
        images: Iterable of 2-D arrays, all of the same shape.

    Returns:
        ndarray with one row of PCA components per image.
    """
    # Small offset so a zero magnitude (e.g. a constant image) gives a large
    # negative finite value instead of -inf, which would break the PCA fit.
    eps = 1e-8

    features = []
    for img in images:
        f_shift = np.fft.fftshift(np.fft.fft2(img))
        magnitude_spectrum = 20 * np.log(np.abs(f_shift) + eps)
        features.append(magnitude_spectrum.flatten())
    features = pca.fit_transform(features)
    return np.array(features)

# Load and preprocess images from both folders
cgi_images, cgi_labels = load_images('CGI', 1) # 1 for CGI
photo_images, photo_labels = load_images('Photo', 0) # 0 for Real Photo

# Balance the classes by truncating both to the size of the smaller one.
min_length = min(len(cgi_images), len(photo_images))
cgi_images = cgi_images[:min_length]
cgi_labels = cgi_labels[:min_length]
photo_images = photo_images[:min_length]
photo_labels = photo_labels[:min_length]

# Combine datasets (CGI first, then Photo; the two lists stay aligned)
images = cgi_images + photo_images
# NOTE: `labels` rebinds the name that earlier held the array loaded from
# 'labels.npy'.
labels = cgi_labels + photo_labels

print(f"Number of CGI images: {len(cgi_images)}")
print(f"Number of Photo images: {len(photo_images)}")

# Extract features (extract_features fits PCA on all images, i.e. before the
# train/test split below)
features = extract_features(images)

# Convert the label list to an ndarray
labels = np.array(labels)

# Split data into training and testing sets
# NOTE: this rebinds X_train / X_test / y_train / y_test, which earlier in the
# notebook held the split of the precomputed embeddings.
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42, stratify=labels)

In [None]:
X_train.shape

In [None]:
embeddings.shape

In [None]:
X_test.shape