Practicing PCA on Two Datasets¶

In this exercise, we explore the application of PCA to two different datasets: one is synthetic, and the other is available through the sklearn package.

Submission¶

  • There are four tasks for you.
  • Report the results and answer the questions in the PDF file that you submit along with your other solutions.
  • Additionally, submit your code in the same Jupyter notebook format (keep the overall format of the notebook unchanged).

Packages¶

First of all, let's import the packages we need for this assignment.

In [2]:
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from IPython.utils.frame import extract_module_locals
from scipy.linalg import svd
from sklearn.datasets import fetch_lfw_people
from sklearn.utils import shuffle
from sklearn.decomposition import KernelPCA, PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Determine the notebook's working directory; fall back to the current directory
# when the session path is unavailable (e.g. when running outside Jupyter)
try:
  WORKING_DIR = os.path.dirname(extract_module_locals()[1]['__session__'])
except Exception:
  WORKING_DIR = os.getcwd()

Datasets:¶

The first dataset is the LFW (Labeled Faces in the Wild) dataset, and the second is a synthetic dataset that is described further below.

First Dataset: Labeled Faces in the Wild (LFW)¶

The Labeled Faces in the Wild (LFW) dataset is a collection of face images. This dataset contains images of famous individuals collected from the web, with each image labeled by the identity of the person in the image.

Features:¶

  • Image data: Each image is a grayscale 2D image of size 62x47 pixels, representing a person's face.
  • Label: The name of the person in the image, which serves as the class label.

Dataset Properties:¶

  • Number of Samples:
    • Full dataset: 13,233 images.
    • Reduced version (available via sklearn): 1,288 images.
    • "George W Bush" class: 530 images.
  • Number of Classes:
    • Full dataset: 5,749 unique individuals, with varying numbers of images per person.
    • Reduced version (available via sklearn): 7 classes.
  • Image Size: Each image is 62x47 pixels (grayscale), giving 2,914 features per image (62 × 47 = 2,914).
  • Format: The dataset provides images as NumPy arrays and labels as strings.

Loading LFW dataset:¶

In [3]:
lfw_people = fetch_lfw_people(min_faces_per_person=70, resize=0.5)
n_samples_LFW, image_height, image_width = lfw_people.images.shape

X_LFW = lfw_people.data
Y_LFW = lfw_people.target

Y_names_LFW = lfw_people.target_names
n_classes_LFW = Y_names_LFW.shape[0]

print('Dataset properties:')
print('\t Number of data points: %d' % X_LFW.shape[0])
print('\t Number of features: %d' % X_LFW.shape[1])
print('\t Number of classes: %d' % n_classes_LFW)
print('\t Width of each image: %d' % image_width)
print('\t Height of each image: %d' % image_height)
Dataset properties:
	 Number of data points: 1288
	 Number of features: 2914
	 Number of classes: 7
	 Width of each image: 47
	 Height of each image: 62

Visualization:¶

In [4]:
def plot_faces(images, labels, names, n_row, n_col):
  """Helper function to plot a gallery of portraits"""
  f = plt.figure(figsize=(8, 3))
  for i in range(n_row * n_col):
    subfigure = f.add_subplot(n_row, n_col, i + 1)
    subfigure.imshow(images[i].reshape((image_height, image_width)), cmap=plt.cm.gray)
    subfigure.set_title(names[labels[i]], fontsize=10)
    # Removing the axes
    plt.xticks(())
    plt.yticks(())
  plt.show()


plot_faces(X_LFW, Y_LFW, Y_names_LFW, 2, 5)
[Figure: 2×5 gallery of sample LFW faces with their identity labels]

Second Dataset: Two Noisy Circles (TNC)¶

This dataset contains synthetic samples of two noisy circles in 2D space. Each circle has a different radius, and noise is added to the points to simulate imperfect real-world data.

Features:¶

  • coord_x: The x-coordinate of the data point in 2D space.
  • coord_y: The y-coordinate of the data point in 2D space.
  • label: A categorical label indicating to which circle the point belongs:
    • 1: Points belonging to the inner circle (radius = 2).
    • 2: Points belonging to the outer circle (radius = 10).

Data Description:¶

  1. Inner Circle (Label 1):
    • Radius: 2.
    • Number of points: 1500.
    • Gaussian noise added to both x and y coordinates with a noise level of 0.2.
  2. Outer Circle (Label 2):
    • Radius: 10.
    • Number of points: 1500.
    • Gaussian noise added with the same level of 0.2.

Dataset Properties:¶

  • Number of Samples: 3000 (1500 samples for each circle).
  • Noise Level: 0.2 (Gaussian noise applied independently to the x and y coordinates).
  • Labels: 1 for the inner circle, 2 for the outer circle.

Generating TNC Dataset:¶

In [5]:
# Parameters for both circles
radius1 = 2
radius2 = 10
num_points = 1500
noise_level = 0.2
In [6]:
# Function to generate noisy points on a circle with a label
def generate_noisy_circle_points_with_labels(radius: int, num_points: int, noise_level: float, label: int):
  # Random angles for polar coordinates
  theta = np.random.uniform(0, 2 * np.pi, num_points)
  # Coordinates on the circle (without noise)
  coord_x = radius * np.cos(theta)
  coord_y = radius * np.sin(theta)
  # Add noise
  coord_x += np.random.normal(0, noise_level, num_points)
  coord_y += np.random.normal(0, noise_level, num_points)
  # Create labels
  labels = np.full(num_points, label)
  return coord_x, coord_y, labels


if not os.path.exists(SYNTHETIC_FILE := os.path.join(WORKING_DIR, 'synthetic_circle_data.csv')):
  # Generate points for both circles
  coord_x1, coord_y1, labels1 = generate_noisy_circle_points_with_labels(radius1, num_points, noise_level, 1)
  coord_x2, coord_y2, labels2 = generate_noisy_circle_points_with_labels(radius2, num_points, noise_level, 2)

  # Combine the two circles
  coord_x = np.concatenate([coord_x1, coord_x2])
  coord_y = np.concatenate([coord_y1, coord_y2])
  labels = np.concatenate([labels1, labels2])

  # Create DataFrame for easy manipulation and shuffle
  data = pd.DataFrame({'coord_x': coord_x, 'coord_y': coord_y, 'label': labels})
  data = shuffle(data).reset_index(drop=True)

  # Save the shuffled data to a CSV file
  data.to_csv(SYNTHETIC_FILE, index=False)

Loading TNC dataset:¶

In [7]:
# Load the dataset
data = pd.read_csv(SYNTHETIC_FILE)

# Split features and labels
X_TNC = data[['coord_x', 'coord_y']].values
Y_TNC = data['label'].values

Visualization:¶

When plotted, the dataset exhibits two concentric, noisy circles. The inner circle is labeled as 1 and colored red, while the outer circle is labeled as 2 and colored blue. Both circles have Gaussian noise applied, which makes the data more realistic by introducing variance in the points' positions.

In [8]:
# Plot the points, color-coded by label (1 -> red, 2 -> blue)
plt.figure(figsize=(6, 6))
plt.scatter(data['coord_x'], data['coord_y'], c=data['label'], cmap='bwr', s=5)

# Set equal aspect ratio to make the circles look accurate
plt.gca().set_aspect('equal', adjustable='box')

# Set labels and title
plt.xlabel('X')
plt.ylabel('Y')
plt.title('Synthetic Samples of Two Noisy Circles')

plt.show()
[Figure: scatter plot of the two noisy circles (inner circle red, outer circle blue)]

Task 1: Eigenfaces (5 points) ¶

NOTE: In this task you only work with the LFW dataset.

The code below generates the "average face" as well as the first "eigenface" for the George W Bush class.

NOTE: For the rest of the assignment you work only with the "George W Bush" class.

Select "George W Bush" class¶

Run the code below to select a specific class of faces.

In [9]:
class_name_bush = 'George W Bush'
class_indx_bush = list(Y_names_LFW).index(class_name_bush)
X_bush = X_LFW[Y_LFW == class_indx_bush, :]
Y_bush = Y_LFW[Y_LFW == class_indx_bush]
plot_faces(X_bush, Y_bush, Y_names_LFW, 2, 5)

print(X_bush.shape)
[Figure: gallery of George W Bush face images]
(530, 2914)

Implementation¶

In this section you need to implement the centeralize_data() and pca_components() functions.

We use the SVD factorization to implement PCA. The following link explains the connection between PCA and SVD and how to use SVD to perform PCA: https://stats.stackexchange.com/questions/134282/relationship-between-svd-and-pca-how-to-use-svd-to-perform-pca

You may also watch this video for a detailed explanation of SVD: https://youtu.be/nbBvuuNVfco?si=SfJSE85fjMUMiboK
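As an optional sanity check (not required by the assignment; it relies only on the packages imported above), the snippet below builds a small random matrix and verifies that the principal directions obtained from the SVD of the centered data agree with scikit-learn's PCA components up to a sign flip:

# Optional illustration of the PCA/SVD connection on a small random matrix.
rng = np.random.default_rng(0)
X_demo = rng.normal(size=(50, 5))

# Principal directions via SVD of the centered data
X_centered = X_demo - X_demo.mean(axis=0)
_, _, Vt_demo = svd(X_centered, full_matrices=False)

# Principal directions via scikit-learn (it centers the data internally)
pca_demo = PCA(n_components=3).fit(X_demo)

# Each pair of directions should be parallel (|cosine| close to 1), up to a sign flip
for k in range(3):
  alignment = abs(np.dot(Vt_demo[k], pca_demo.components_[k]))
  print(f'component {k}: |cosine similarity| = {alignment:.6f}')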

In [10]:
def centeralize_data(data):
  # Subtract the per-feature mean; return the centered data and the mean (as a row vector).
  return data - (data_mean := np.mean(data, axis=0).reshape(1, -1)), data_mean


def pca_components(Vt, n_components):
  # The principal components are the first n_components rows of Vt (the right singular vectors).
  return Vt[:n_components]


def normalized_svd(data):
  # Center the data, then factor it as U @ diag(s) @ Vt.
  XN, data_mean = centeralize_data(data)
  return *svd(XN), data_mean


def average_image_class(images):
  # Pixel-wise mean over all images of the class (the "average face").
  return np.mean(images, axis=0)


def eigen_face_class(images):
  # First principal component of the class (the first "eigenface").
  return pca_components(normalized_svd(images)[2], n_components=1)


def plot_class_representatives(images, class_name, aggregator):
  f = plt.figure(figsize=(2, 2))
  subfigure = f.add_subplot(1, 1, 1)
  subfigure.imshow(aggregator(images).reshape((image_height, image_width)), cmap=plt.cm.gray)
  subfigure.set_title(class_name)
  # Removing the axes
  plt.xticks(())
  plt.yticks(())
  plt.show()


plot_class_representatives(X_bush, class_name_bush, aggregator=average_image_class)
plot_class_representatives(X_bush, class_name_bush, aggregator=eigen_face_class)
[Figure: average face of the George W Bush class]
[Figure: first eigenface of the George W Bush class]

Task 2: PCA Transformation and Reconstruction (15 points) ¶

In this task, we focus on implementing two fundamental functions for Principal Component Analysis (PCA):

  • pca_transform()
  • pca_inverse_transform()

Part A (10 points): ¶

  • pca_transform(): Projects the original data onto a new space using PCA via Singular Value Decomposition (SVD).

    Inputs:¶

    • X (ndarray):

      • The input data matrix to be transformed. Each row represents a sample, and each column represents a feature.
    • n_components (int):

      • The number of principal components to retain.
      • Controls the dimensionality reduction by selecting the top n_components.
      • For instance, setting n_components=2 reduces the data to 2D space by projecting it onto the first two principal components.

    Outputs:¶

    • transformed_data (ndarray):

      • The data projected onto the top n_components principal components, resulting in a reduced-dimension representation of the original data.
    • Vt (ndarray):

      • The right singular vectors from SVD, needed for inverse transformation to reconstruct the original data.
    • data_mean (ndarray):

      • The mean of the original data, which is subtracted during the transformation process and added back during the reconstruction.
In [11]:
def pca_transform(X, n_components):
  """
  Transforms the data into a new space using PCA via SVD.

  Parameters:
  - X: ndarray
      The input data matrix to be transformed.
  - n_components: int
      The number of principal components to retain.

  Returns:
  - transformed_data: ndarray
      The data projected onto the top n_components principal components.
  - Vt: ndarray
      The right singular vectors from SVD, needed for inverse transformation.
  - data_mean: ndarray
      The mean of the original data, required to reconstruct the data.
  """
  U, s, *result = normalized_svd(X)
  # Scores of the top components: U[:, :k] * s[:k]; Vt and the data mean are also
  # returned (in `result`) so the transformation can be inverted later.
  return U[:, :n_components] * s[:n_components], *result
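As an optional sanity check (an aside, not part of the task), pca_transform should agree with scikit-learn's PCA scores up to a per-component sign flip:

# Optional check of pca_transform against sklearn.decomposition.PCA on random data.
X_demo = np.random.default_rng(2).normal(size=(30, 6))
T_ours, _, _ = pca_transform(X_demo, n_components=3)
T_skl = PCA(n_components=3).fit_transform(X_demo)
signs = np.sign(np.sum(T_ours * T_skl, axis=0))  # align the sign of each component
print('matches sklearn PCA scores:', np.allclose(T_ours, T_skl * signs, atol=1e-6))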

We apply the PCA transform to both datasets:¶

For the TNC dataset we set n_components=2

For the LFW dataset we set n_components=100

IMPORTANT NOTE: Remember that for the TNC dataset we work with X_TNC, whose points carry one of two labels, while for the LFW dataset we work only with X_bush.


In [12]:
n_components = 100

# Transform the X_bush data using the specified number of principal components
T_bush, Vt_bush, mean_bush = pca_transform(X_bush, n_components)


n_components = 2

# Transform the X_TNC data using the specified number of principal components
T_TNC, Vt_TNC, mean_TNC = pca_transform(X_TNC, n_components)

Part B (5 points): ¶

  • pca_inverse_transform(): Reconstructs the original data from its transformed representation.

    • transformed_data (ndarray):

      • The data that has been projected into the lower-dimensional space using PCA.
      • This is the result of the pca_transform() function and represents the coordinates of the original data in the new principal component space.
    • Vt (ndarray):

      • The right singular vectors from SVD (the principal components), used for the reconstruction.
      • This matrix is obtained from the SVD process, and only the top n_components vectors are used for reconstruction.
    • n_components (int):

      • The number of principal components that were used for the transformation.
      • Determines how many components to use in the reconstruction process.
    • data_mean (ndarray):

      • The mean of the original data.
      • This is used to reverse the centering operation that was applied to the data before performing PCA.
In [13]:
def pca_inverse_transform(transformed_data, Vt, n_components, data_mean):
  """
  Reconstructs the original data from its transformed representation.

  Parameters:
  - transformed_data: ndarray
      The data in the reduced-dimensional space.
  - Vt: ndarray
      The right singular vectors from SVD, used to reconstruct the original space.
  - n_components: int
      The number of principal components that were used for the transformation.
  - data_mean: ndarray
      The mean of the original data (used for centering).

  Returns:
  - reconstructed_data: ndarray
      The data reconstructed back into the original space.
  """
  return transformed_data @ pca_components(Vt, n_components) + data_mean
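As an optional round-trip check (an aside on a toy matrix, not part of the grading), keeping all components and inverting should recover the data almost exactly, while truncating components leaves some reconstruction error:

# Optional round-trip check on a small random matrix.
X_check = np.random.default_rng(1).normal(size=(20, 4))

# Full-rank round trip: the reconstruction should match the input to numerical precision
T_full, Vt_full, mean_full = pca_transform(X_check, n_components=4)
print('full-rank round trip exact:', np.allclose(X_check, pca_inverse_transform(T_full, Vt_full, 4, mean_full)))

# Truncated round trip: some reconstruction error remains
T_low, Vt_low, mean_low = pca_transform(X_check, n_components=2)
X_low = pca_inverse_transform(T_low, Vt_low, 2, mean_low)
print('average error with 2 components:', np.mean(np.sum((X_check - X_low) ** 2, axis=1)))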

We apply the PCA inverse transform to both transformed datasets:¶

For the TNC dataset we set n_components=2

For the LFW dataset we set n_components=100

In [14]:
n_components = 100

# Reconstruct the X_bush data back to the original image space.
reconstructed_bush = pca_inverse_transform(T_bush, Vt_bush, n_components, mean_bush)


n_components = 2

# Reconstruct the X_TNC data back to the original 2D space.
reconstructed_TNC = pca_inverse_transform(T_TNC, Vt_TNC, n_components, mean_TNC)

TNC Visualization:¶

Since the circle dataset is two-dimensional, it is easy to plot the transformation results:

As shown in the TNC visualization, the dimensionality of the data is not reduced, since we keep both components, so no information is lost. The only noticeable difference is a rotation of the (centered) data.

Observe that each point is plotted in the color of its corresponding label.
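As a quick check of this claim (an optional aside that assumes the variables Vt_TNC and reconstructed_TNC from the cells above are in scope), the 2×2 matrix of principal components is orthonormal and the full-rank reconstruction matches the original data to numerical precision:

# Optional check: with both components kept, the PCA map is an orthonormal change
# of basis of the centered data, so the reconstruction is (numerically) exact.
rotation = pca_components(Vt_TNC, 2)
print('rows are orthonormal:', np.allclose(rotation @ rotation.T, np.eye(2)))
print('reconstruction matches original:', np.allclose(reconstructed_TNC, X_TNC))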

In [15]:
# Plotting the results
fig, axs = plt.subplots(1, 3, figsize=(12, 4))

# Plotting Original Data for X_TNC
axs[0].scatter(X_TNC[:, 0], X_TNC[:, 1], c=Y_TNC, cmap='bwr', s=5)
axs[0].set_title('Original X_TNC')

# Plotting Transformed Data for X_TNC
axs[1].scatter(T_TNC[:, 0], T_TNC[:, 1], c=Y_TNC, cmap='bwr', s=5)
axs[1].set_title('Transformed X_TNC')

# Plotting Reconstructed Data for X_TNC
axs[2].scatter(reconstructed_TNC[:, 0], reconstructed_TNC[:, 1], c=Y_TNC, cmap='bwr', s=5)
axs[2].set_title('Reconstructed X_TNC')

plt.tight_layout()
plt.show()
[Figure: original, transformed, and reconstructed X_TNC scatter plots]

LFW Visualization:¶

In the case of the LFW dataset, we are working with 530 samples, each represented in a 2,914-dimensional feature space. Visualizing data in such a high-dimensional space is not feasible, and even after applying PCA to reduce the dimensionality to 100, visual interpretation remains challenging. To address this, we project the data onto a 2D plane, enabling us to better visualize the underlying structure. Although this 2D representation no longer reflects the pixel values of the original images, it captures the most important variance in the data, allowing us to interpret patterns and relationships more easily in the reduced space.

As a result, we can only see the scatter plot of the first two most important new features (the first two principal components).
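As an optional aside (not required by the task), the singular values of the centered X_bush tell us how much of the total variance the first two components actually capture; the fraction is printed rather than assumed:

# Optional: fraction of the total variance captured by the first two components.
_, s_bush, _, _ = normalized_svd(X_bush)
explained_ratio = s_bush ** 2 / np.sum(s_bush ** 2)
print(f'PC1 + PC2 explain {100 * explained_ratio[:2].sum():.1f}% of the total variance')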

In [16]:
# Extract the first two principal components for plotting
pc1 = T_bush[:, 0]  # First principal component (PC1)
pc2 = T_bush[:, 1]  # Second principal component (PC2)

# Plotting the first two principal components
plt.figure(figsize=(5, 4))
plt.scatter(pc1, pc2, alpha=0.7, edgecolors='b')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.title('Projection onto the First Two Principal Components')
plt.grid(True)
plt.show()
[Figure: X_bush projected onto the first two principal components]

Moreover, some information is lost when reducing the dimensionality of the data. This can be observed by plotting the reconstructed image for a specific instance.

In [17]:
index = 0  # Choose an index of the image you want to show (e.g., the first image)


original_image = X_bush[index].reshape(image_height, image_width)
reconstructed_image = reconstructed_bush[index].reshape(image_height, image_width)

# Plotting the results
fig, axs = plt.subplots(1, 2, figsize=(10, 5))

# Plotting Original Image from X_bush
axs[0].imshow(original_image, cmap='gray')
axs[0].set_title('Original Image (X_bush)')


# Plotting Reconstructed Image from X_bush
axs[1].imshow(reconstructed_image, cmap='gray')
axs[1].set_title('Reconstructed Image (X_bush)')

plt.tight_layout()
plt.show()
[Figure: original image vs. reconstruction from 100 principal components]

Task 3: Average Reconstruction Error for LFW (30 points)¶

Split the image dataset into a training set (400 images) and a test set (the remaining images). Train PCA with [2, 10, 30, 60, 100] components. The reconstruction error is defined as

\begin{align} \text{error}=\frac{1}{n}\sum_{i=1}^n||x_i-\text{reconstruct}(pca(x_i))||^2_2 \end{align}
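As a small illustrative example (with made-up residuals): for $n = 2$ samples whose reconstruction residuals $x_i - \text{reconstruct}(pca(x_i))$ are $(1, 2)$ and $(0, 3)$, the error is $\tfrac{1}{2}\big((1^2 + 2^2) + (0^2 + 3^2)\big) = 7$.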

Part A (20 points): ¶

Plot average reconstruction error on training and testing data points with the following requirements:

  1. X-axis shows number of components.
  2. Y-axis shows reconstruction error.
  3. Draw two curves, one for the training data and one for the test data.
In [18]:
# Define the number of components to test in PCA
c_components = [2, 10, 30, 60, 100]

# Initialize lists to store the reconstruction errors for training and testing data
train_errors, test_errors = [], []

# Initialize deterministic seed
SEED = 42

X_train, X_test = train_test_split(X_bush, train_size=400, random_state=SEED)


# \text{error}=\frac{1}{n}\sum_{i=1}^n||x_i-\text{reconstruct}(pca(x_i))||^2_2
def mse(original, reconstructed):
  # Per-sample squared L2 reconstruction error, averaged over all samples.
  return np.mean(np.sum((original - reconstructed) ** 2, axis=1))


# Loop through each specified number of components for PCA
for n_components in c_components:
  # Apply PCA and then inverse PCA to the training data
  transformed_train, Vt_train, mean_train = pca_transform(X_train, n_components)

  # Calculate the Mean Squared Error (MSE) as the reconstruction error for the training set
  train_errors.append(mse(X_train, pca_inverse_transform(transformed_train, Vt_train, n_components, mean_train)))
  # Center the test data with the training mean, project it onto the training PCA
  # components, reconstruct it, and compute the average reconstruction error for the test set.
  test_errors.append(
    mse(
      X_test,
      pca_inverse_transform(
        (X_test - mean_train) @ pca_components(Vt_train, n_components).T, Vt_train, n_components, mean_train
      ),
    )
  )

# Print the average reconstruction errors for each number of components
for i, n_components in enumerate(c_components):
  print(f'Components: {n_components}\n  Train Error: {train_errors[i]:.4f}\n  Test Error: {test_errors[i]:.4f}')
Components: 2
  Train Error: 40.2048
  Test Error: 44.1277
Components: 10
  Train Error: 21.6275
  Test Error: 25.1425
Components: 30
  Train Error: 11.6392
  Test Error: 15.6092
Components: 60
  Train Error: 6.6892
  Test Error: 11.4092
Components: 100
  Train Error: 3.7635
  Test Error: 8.7075
In [19]:
def plot_eval_results(c_components, train_error, test_error):
  plt.figure(figsize=(6 * 1.5, 5 * 1.2))
  plt.plot(c_components, train_error, 'b-o', label='train error')
  plt.plot(c_components, test_error, 'r-o', label='test error')
  plt.xlabel('n_components')
  plt.ylabel('reconstruction error')
  plt.legend()
  plt.grid(True)
  plt.show()


plot_eval_results(c_components, train_errors, test_errors)
[Figure: train and test reconstruction error vs. number of components]

Part B (10 points): ¶

  1. Explain the difference between the two graphs (8 points).

  2. What would the error be if we computed it for the TNC dataset using two components and 2000 samples for training? (2 points)

In [20]:
# Use two principal components for the TNC dataset
n_components = 2

TNC_train, TNC_test = train_test_split(X_TNC, train_size=2000, random_state=SEED)
transformed_TNC_train, Vt_TNC_train, mean_TNC_train = pca_transform(TNC_train, n_components)
train_error = mse(TNC_train, pca_inverse_transform(transformed_TNC_train, Vt_TNC_train, n_components, mean_TNC_train))
test_error = mse(
  TNC_test,
  pca_inverse_transform(
    (TNC_test - mean_TNC_train) @ pca_components(Vt_TNC_train, n_components).T,
    Vt_TNC_train,
    n_components,
    mean_TNC_train,
  ),
)

# Print the average reconstruction errors for each number of components
print(f'Average reconstruction error for train data is {train_error}')
print(f'Average reconstruction error for test data is {test_error}')
Average reconstruction error for train data is 6.217495281310197e-30
Average reconstruction error for test data is 1.2387388672271008e-30

Task 4: Kernel PCA (30 points)¶

KernelPCA is an extension of PCA that uses kernel methods to perform dimensionality reduction in a high-dimensional space. Unlike standard PCA, which finds linear components, Kernel PCA can capture non-linear relationships in the data by implicitly mapping it into a higher-dimensional space using a kernel function. In this task, we will use the Radial Basis Function (RBF) kernel, i.e. a Gaussian kernel.

The gamma parameter of the RBF kernel controls the spread of the kernel function. It acts as an inverse squared length scale: for a Gaussian with standard deviation sigma, gamma = 1/(2*sigma^2). A lower gamma value gives each training point a wider influence, while a higher gamma value makes the influence more localized. You can read more about the gamma parameter here.
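For intuition (a small illustrative aside, not required by the task), the RBF kernel is k(x, y) = exp(-gamma * ||x - y||^2); the snippet below, using two arbitrary 2D points a fixed distance apart, shows how increasing gamma shrinks their kernel similarity:

# Illustration only: RBF similarity between two fixed points for several gamma values.
from sklearn.metrics.pairwise import rbf_kernel

x = np.array([[0.0, 0.0]])
y = np.array([[0.0, 3.0]])  # squared distance ||x - y||^2 = 9

for g in [0.01, 0.1, 1.0]:
  k_lib = rbf_kernel(x, y, gamma=g)[0, 0]
  k_manual = np.exp(-g * np.sum((x - y) ** 2))
  print(f'gamma={g}: k(x, y) = {k_lib:.5f} (manual: {k_manual:.5f})')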

For this task, you don't need to implement Kernel PCA from scratch; use scikit-learn's KernelPCA (https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.KernelPCA.html).

An example setup for scikit-learn's Kernel PCA is:

  KernelPCA(kernel='rbf', n_components=100, gamma=0.1, fit_inverse_transform=True)

Now, let's proceed with the following parts:

Part A (10 points): ¶

Apply Kernel PCA and Plot Transformed Data:¶

First, plot the original TNC data again. Then apply Kernel PCA to the TNC dataset using different values of gamma (e.g., 0.0001, 0.0005, 0.001, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1). For each value of gamma, plot the transformed data to visualize the effect of the RBF kernel on the dataset. This step will help you understand how gamma influences the data's representation in the transformed space.

For this part you need to plot 12 figures (one for the original data and 11 for the different values of gamma).

In [21]:
# Load the dataset
data = pd.read_csv(SYNTHETIC_FILE)

# Split features and labels
X_TNC = data[['coord_x', 'coord_y']].values
Y_TNC = data['label'].values
In [22]:
# Define different values of gamma to test
gamma_values = [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1]
n_components = 2

# Standardize the features
scaler = StandardScaler()
X_TNC_scaled = scaler.fit_transform(X_TNC)

# Create subplots to visualize the transformed data for each gamma
plt.figure(figsize=(20, 15))

# Plot the original data before applying Kernel PCA
plt.subplot(3, 4, 1)
plt.scatter(X_TNC_scaled[:, 0], X_TNC_scaled[:, 1], c=Y_TNC, cmap='bwr')
plt.title('Original Data')
plt.xlabel('coord_x')
plt.ylabel('coord_y')

# Set the limits for the x and y axes
x_limits, y_limits = (-4, 4), (-4, 4)

# Apply Kernel PCA for each gamma value
for idx, gamma in enumerate(gamma_values):
  # Apply Kernel PCA
  kpca = KernelPCA(n_components=n_components, kernel='rbf', gamma=gamma)
  X_kpca = kpca.fit_transform(X_TNC_scaled)

  # Plot the transformed data
  plt.subplot(3, 4, idx + 2)
  plt.scatter(X_kpca[:, 0], X_kpca[:, 1], c=Y_TNC, cmap='viridis')
  plt.title(f'Gamma = {gamma}')
  plt.xlabel('First principal component')
  plt.ylabel('Second principal component')
  # Set fixed x and y axis limits
  plt.xlim(x_limits)
  plt.ylim(y_limits)

plt.tight_layout()
plt.show()
[Figure: original TNC data and Kernel PCA projections for each gamma value]

Part B (5 points): ¶

Based on your observations, how does Kernel PCA compare to Linear PCA on this dataset with red and blue labels? In what ways does Kernel PCA affect the distribution of the data points, particularly in terms of how well the red and blue points are organized? Choose the best value(s) for gamma and report it (them). What criteria did you use to determine the optimal gamma value?

Part C (5 points): ¶

Find the Best Values for Reconstruction Error of Kernel PCA¶

In this part, you will tune two hyperparameters: gamma (values: [0.01, 0.05, 0.1, 0.5, 1]) and n_components (values: [2, 10, 30, 60, 100]) for the LFW dataset. Set the train size to 400 samples. Your objective is to find the best combination of gamma and n_components based on the reconstruction error. Present your results in two tables showing the reconstruction error for each combination on the training set and the test set.

In [23]:
# Define the hyperparameter ranges
gamma_values = [0.005, 0.01, 0.05, 0.1, 0.5]
n_components_values = [2, 10, 30, 60, 100]

X_train, X_test = train_test_split(X_bush, train_size=400, random_state=SEED)

train_errors = {gamma: [] for gamma in gamma_values}
test_errors = {gamma: [] for gamma in gamma_values}

# Loop through each combination of gamma and n_components
for gamma in gamma_values:
  for n_components in n_components_values:
    # Fit KernelPCA
    kpca = KernelPCA(n_components=n_components, kernel='rbf', gamma=gamma, fit_inverse_transform=True)
    X_train_kpca = kpca.fit_transform(X_train)
    X_test_kpca = kpca.transform(X_test)

    # Reconstruct the data
    X_train_reconstructed = kpca.inverse_transform(X_train_kpca)
    X_test_reconstructed = kpca.inverse_transform(X_test_kpca)
    train_errors[gamma].append(mse(X_train, X_train_reconstructed))
    test_errors[gamma].append(mse(X_test, X_test_reconstructed))

train_errors_df = pd.DataFrame(train_errors, index=n_components_values)
test_errors_df = pd.DataFrame(test_errors, index=n_components_values)

# Display the results as tables
print('Reconstruction Error for Training Set:')
print(train_errors_df)
print('\nReconstruction Error for Testing Set:')
print(test_errors_df)

# Find the best combination
best_n_components, best_gamma = np.unravel_index(test_errors_df.values.argmin(), test_errors_df.shape)
best_error = test_errors_df.values[best_n_components, best_gamma]

print(
  f'Best combination:\n  Gamma: {gamma_values[best_gamma]} - n_components: {n_components_values[best_n_components]}\n  test error: {best_error}'
)
Reconstruction Error for Training Set:
         0.005      0.010      0.050      0.100      0.500
2    56.035629  52.460106  59.473942  62.702892  62.768883
10   54.013870  48.537128  54.149078  59.908009  62.227745
30   53.707760  47.722954  52.062382  57.340527  61.118011
60   53.642696  47.483402  50.792248  54.513866  57.952980
100  53.620888  47.394039  49.331505  52.528706  51.324707

Reconstruction Error for Testing Set:
         0.005      0.010      0.050      0.100      0.500
2    60.903149  57.393143  65.655075  68.015724  68.021919
10   58.914120  53.705605  62.367989  67.361328  67.999069
30   58.638409  53.019703  61.464310  67.065262  68.198509
60   58.598793  52.895687  61.163082  66.958488  69.903511
100  58.587997  52.866302  61.069897  67.134178  78.636238
Best combination:
  Gamma: 0.01 - n_components: 100
  test error: 52.866302490234375

Part D (10 points): ¶

Visualization of Reconstruction Error (5 points)¶

Plot the average reconstruction error on both the training and testing data with the following requirements:

  1. Use n_components = 60.
  2. Use the same gamma values as before (values: [0.01, 0.05, 0.1, 0.5, 1]).
  3. Train size = 400
  4. The X-axis should display the different values of gamma.
  5. The Y-axis should show the reconstruction error.
  6. Draw two graphs: one for the training set and one for the test set.

Explanation (5 points)¶

How does Kernel PCA compare to Linear PCA on this dataset? If Kernel PCA shows improved performance, please justify your answer. If Linear PCA performs better, explain the reasons for its effectiveness.

Hint: Consider factors such as the choice of kernel, the characteristics of the data, and how the gamma parameter influences the transformed space.

In [24]:
APPLY_SCALED = False
USE_SKLEARN_PCA = True
In [25]:
# Choose n_components = 60 for visualization
n_components = 60

scaler = StandardScaler()

# Initialize lists to store errors
train_errors_kpca = []
test_errors_kpca = []
train_error_pca = []
test_error_pca = []

X_train, X_test = train_test_split(X_bush, train_size=400, random_state=SEED)

if APPLY_SCALED:
  X_train_scaled = scaler.fit_transform(X_train)
  X_test_scaled = scaler.transform(X_test)
else:
  X_train_scaled = X_train
  X_test_scaled = X_test

# Compute errors for Kernel PCA
for gamma in gamma_values:
  kpca = KernelPCA(n_components=n_components, kernel='rbf', gamma=gamma, fit_inverse_transform=True)

  # Train set
  X_train_kpca = kpca.fit_transform(X_train_scaled)
  X_train_reconstructed = kpca.inverse_transform(X_train_kpca)
  train_errors_kpca.append(mse(X_train_scaled, X_train_reconstructed))

  # Test set
  X_test_kpca = kpca.transform(X_test_scaled)
  X_test_reconstructed = kpca.inverse_transform(X_test_kpca)
  test_errors_kpca.append(mse(X_test_scaled, X_test_reconstructed))

# Linear PCA
if not USE_SKLEARN_PCA:
  # PCA via our SVD-based implementation, fit on the (optionally scaled) training data
  transformed_bush_train, Vt_bush_train, mean_bush_train = pca_transform(X_train_scaled, n_components)
  train_error_pca.append(
    mse(X_train_scaled, pca_inverse_transform(transformed_bush_train, Vt_bush_train, n_components, mean_bush_train))
  )
  test_error_pca.append(
    mse(
      X_test_scaled,
      pca_inverse_transform(
        (X_test_scaled - mean_bush_train) @ pca_components(Vt_bush_train, n_components).T,
        Vt_bush_train,
        n_components,
        mean_bush_train,
      ),
    )
  )
else:
  # sklearn PCA
  pca = PCA(n_components=n_components)
  X_train_pca = pca.fit_transform(X_train_scaled)
  X_train_reconstructed_pca = pca.inverse_transform(X_train_pca)
  train_error_pca.append(mse(X_train_scaled, X_train_reconstructed_pca))

  X_test_pca = pca.transform(X_test_scaled)
  X_test_reconstructed_pca = pca.inverse_transform(X_test_pca)
  test_error_pca.append(mse(X_test_scaled, X_test_reconstructed_pca))


# Plot the results
plt.figure(figsize=(12, 5))

# Plot for training set
plt.subplot(1, 2, 1)
plt.plot(gamma_values, train_errors_kpca, marker='o', label='Kernel PCA')
plt.axhline(y=train_error_pca[0], color='r', linestyle='--', label='Linear PCA')
plt.xlabel('Gamma Values')
plt.ylabel('Reconstruction Error')
plt.title(f'Training Set (n_components = {n_components})')
plt.legend()
plt.xscale('log')
plt.grid(True)

# Plot for test set
plt.subplot(1, 2, 2)
plt.plot(gamma_values, test_errors_kpca, marker='o', label='Kernel PCA')
plt.axhline(y=test_error_pca[0], color='r', linestyle='--', label='Linear PCA')
plt.xlabel('Gamma Values')
plt.ylabel('Reconstruction Error')
plt.title(f'Test Set (n_components = {n_components})')
plt.legend()
plt.xscale('log')
plt.grid(True)


plt.tight_layout()
plt.show()

print(f'Linear PCA - Train Error: {train_error_pca[0]:.4f}, Test Error: {test_error_pca[0]:.4f}')
print('Kernel PCA Errors:')
for gamma, train_err, test_err in zip(gamma_values, train_errors_kpca, test_errors_kpca):
  print(f'  Gamma {gamma:.3f} - Train Error: {train_err:.4f}, Test Error: {test_err:.4f}')
[Figure: Kernel PCA vs. Linear PCA reconstruction error on the training and test sets]
Linear PCA - Train Error: 6.7114, Test Error: 11.4403
Kernel PCA Errors:
  Gamma 0.005 - Train Error: 53.6427, Test Error: 58.5988
  Gamma 0.010 - Train Error: 47.4834, Test Error: 52.8957
  Gamma 0.050 - Train Error: 50.7922, Test Error: 61.1631
  Gamma 0.100 - Train Error: 54.5139, Test Error: 66.9585
  Gamma 0.500 - Train Error: 57.9530, Test Error: 69.9035