# Exercise Session - Non Linear Dimensionality Reduction

We will explore **autoencoders** for non-linear dimensionality reduction. **PyTorch** will be used in this exercise; see the previous exercises for a review.

In [None]:
# import a few packages
%matplotlib inline
import numpy as np
from sklearn.datasets import fetch_olivetti_faces
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

import torch
import torchvision
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from torchvision.utils import save_image
from torchvision.datasets import MNIST
from torch.utils.data.sampler import SubsetRandomSampler
import os

# Convolutional Autoencoder
In the lecture, we learned about the concept of a non-linear encoder/decoder architecture, namely the convolutional autoencoder. An encoder $f_{e}$ first encodes the input $x$ into a feature $z$, and this feature is then decoded back into a reconstruction $\hat{x}$ of the input image by a decoder $f_{d}$. This process is defined as

$$
\begin{align*}
z = f_{e}(x) \\
\hat{x} = f_{d}(z)
\end{align*}
$$

The parameters of the autoencoder can be learned by minimizing the MSE loss:

$$
\begin{align*}
\sum_n \lVert \hat{x}_n - x_n \rVert ^2
\end{align*}
$$


We will use the **Conv2d** and **ConvTranspose2d** in pytorch to define our convolutional autoencoder in this exercise.

## MNIST 

We begin with a simple dataset, MNIST, which consists of grayscale images. During training, we visualize both the reconstructed image $\hat{x}$ and the grayscale encoded feature $z$ every 5 epochs. You can find the output pictures of the last training batch in the folder named "mnist_vis".


In [None]:
# define the mnist autoencoder network
class autoencoder_mnist(nn.Module):
    '''
    Convolutional autoencoder for MNIST images of size (1, 28, 28).

    The encoder compresses an image into a feature map of size (d, 3, 3);
    the decoder maps that feature map back to an image of size (1, 28, 28).
    The shape comments next to each layer give that layer's output size
    (channels, height, width) for a single image.

    arg: d: int, the channel size of encoded feature

    '''
    def __init__(self, d=8):
        super().__init__()
        # Encoder: 3 x (Conv2d --> ReLU --> MaxPool2d).
        # The 3x3 convolutions with padding=1 keep the spatial size; each
        # 2x2 max pooling with stride 2 halves it (rounding down, so 7 -> 3).
        self.encoder = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=3, stride=1, padding=1),  # (16, 28, 28)
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),  # (16, 14, 14)
            nn.Conv2d(16, 8, kernel_size=3, stride=1, padding=1),  # (8, 14, 14)
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),  # (8, 7, 7)
            nn.Conv2d(8, d, kernel_size=3, stride=1, padding=1),  # (d, 7, 7)
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2)  # (d, 3, 3)
        )
        # Decoder: 2 x (ConvTranspose2d --> ReLU) --> ConvTranspose2d.
        # Without padding, a transposed convolution produces an output of
        # size (in - 1) * stride + kernel_size. The first layer needs
        # kernel_size=3 to go from 3 to the odd size 7; the other two
        # exactly double the size with kernel_size=2, stride=2.
        self.decoder = nn.Sequential(
            nn.ConvTranspose2d(d, 8, kernel_size=3, stride=2),   # (8, 7, 7)
            nn.ReLU(),
            nn.ConvTranspose2d(8, 16, kernel_size=2, stride=2),  # (16, 14, 14)
            nn.ReLU(),
            nn.ConvTranspose2d(16, 1, kernel_size=2, stride=2)   # (1, 28, 28)
        )

    def forward(self, x):
        '''
        Encode a batch of images and reconstruct it from the encoded feature.

        arg: x: tensor of size (batch, 1, 28, 28)

        return: (reconstruction, feature), of size (batch, 1, 28, 28)
                and (batch, d, 3, 3) respectively
        '''
        z = self.encoder(x)
        x = self.decoder(z)
        return x, z

In [None]:
# define the folders to save the visualization pictures
# (makedirs with exist_ok=True also creates a missing sub-folder when the
# parent folder is already there, and does nothing if everything exists)
os.makedirs('./mnist_vis/reconstruction/', exist_ok=True)
os.makedirs('./mnist_vis/feature/', exist_ok=True)


# define the hyper-parameters
num_epochs = 50
batch_size = 64
learning_rate = 1e-3

# get the data
# ToTensor gives pixel values in [0, 1]; Normalize with mean 0.5 and std 0.5
# then maps them to [-1, 1]
transform = transforms.Compose([transforms.ToTensor(),
                              transforms.Normalize((0.5, ), (0.5, )),
                             ])


dataset = MNIST('mnist_data/', download=True, train=True, transform=transform)
# only the first 12800 training images are used, drawn in random order
dataloader = DataLoader(dataset, batch_size=batch_size, num_workers=4,
                        sampler=SubsetRandomSampler(range(12800)))

# define a model and criterion
model = autoencoder_mnist()
# the reconstruction is compared to the input image with the MSE loss
criterion = nn.MSELoss()

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate,
                             weight_decay=1e-5)

for epoch in range(num_epochs):
    # training
    for data in dataloader:
        # the digit label is not needed to train an autoencoder
        img, _ = data
        # ===================forward=====================
        output, feature = model(img)
        loss = criterion(output, img)

        # ===================backward====================
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # log (loss of the last training batch of this epoch)
    print('Epoch [{}/{}], Training loss:{:.4f}'
          .format(epoch+1, num_epochs, loss.item()))

    if epoch==0 or (epoch+1) % 5 == 0:
        # undo the Normalize transform so the reconstruction is back in the
        # [0, 1] range before it is written as an image; clamp because the
        # decoder output is not bounded
        pic_output = (output.detach() * 0.5 + 0.5).clamp(0, 1)
        save_image(pic_output, './mnist_vis/reconstruction/output_{}.png'.format(epoch+1))
        # lay out each encoded feature as one gray image with one row per channel
        pic_feature = feature.detach().view(feature.size(0), 1, feature.size(1), -1)
        save_image(pic_feature, './mnist_vis/feature/feature_{}.png'.format(epoch+1), normalize=True)

        # visualization
        img1 = mpimg.imread('./mnist_vis/reconstruction/output_{}.png'.format(epoch+1))
        img2 = mpimg.imread('./mnist_vis/feature/feature_{}.png'.format(epoch+1))

        plt.figure(figsize=(12, 6))
        plt.subplot(121).set_title('Reconstruction')
        plt.imshow(img1, aspect='auto')
        plt.axis('off')
        plt.subplot(122).set_title('Encoded Feature')
        plt.imshow(img2, aspect='auto')
        plt.axis('off')
        plt.show()


# save the trained network
torch.save(model.state_dict(), './conv_autoencoder_mnist.pth')
print('Finished.')

## Face 

In the previous exercise, we learned how to use PCA to generate the eigenfaces and reconstruct the faces. Here we show how to do the reconstruction with a convolutional autoencoder.

We visualize the reconstructed image $\hat{x}$ and the grayscale feature $z$ every 50 epochs and save the pictures in the folder named "face_vis".

In [None]:
# define the face autoencoder network
class autoencoder_face(nn.Module):
    '''
    Convolutional autoencoder for face images of size (1, 64, 64).

    The encoder compresses an image into a feature map of size (d, 8, 8);
    the decoder maps that feature map back to an image of size (1, 64, 64).
    The shape comments next to each layer give that layer's output size
    (channels, height, width) for a single image.

    arg: d: int, channel size of encoded feature

    '''
    def __init__(self, d=8):
        super().__init__()
        # Encoder: 3 x (Conv2d --> ReLU --> MaxPool2d).
        # The 3x3 convolutions with padding=1 keep the spatial size; each
        # 2x2 max pooling with stride 2 halves it.
        self.encoder = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=3, stride=1, padding=1),  # (16, 64, 64)
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),  # (16, 32, 32)
            nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1),  # (32, 32, 32)
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),  # (32, 16, 16)
            nn.Conv2d(32, d, kernel_size=3, stride=1, padding=1),  # (d, 16, 16)
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2)  # (d, 8, 8)
        )
        # Decoder: 2 x (ConvTranspose2d --> ReLU) --> ConvTranspose2d.
        # Without padding, a transposed convolution produces an output of
        # size (in - 1) * stride + kernel_size. All sizes here are even, so
        # every layer (including the first one) uses kernel_size=2 with
        # stride=2 to exactly double the size: 8 -> 16 -> 32 -> 64.
        self.decoder = nn.Sequential(
            nn.ConvTranspose2d(d, 32, kernel_size=2, stride=2),   # (32, 16, 16)
            nn.ReLU(),
            nn.ConvTranspose2d(32, 16, kernel_size=2, stride=2),  # (16, 32, 32)
            nn.ReLU(),
            nn.ConvTranspose2d(16, 1, kernel_size=2, stride=2)   # (1, 64, 64)
        )

    def forward(self, x):
        '''
        Encode a batch of images and reconstruct it from the encoded feature.

        arg: x: tensor of size (batch, 1, 64, 64)

        return: (reconstruction, feature), of size (batch, 1, 64, 64)
                and (batch, d, 8, 8) respectively
        '''
        z = self.encoder(x)
        x = self.decoder(z)
        return x, z

In [None]:
# define the folders to save the visualization pictures
# (makedirs with exist_ok=True also creates a missing sub-folder when the
# parent folder is already there, and does nothing if everything exists)
os.makedirs('./face_vis/reconstruction/', exist_ok=True)
os.makedirs('./face_vis/feature/', exist_ok=True)

#define the hyper-parameters
num_epochs = 500
batch_size = 5
learning_rate = 1e-3

# load the dataset
# each row of the data matrix is one flattened face; reshape the rows into
# single-channel 64x64 images (reshape returns a new view instead of
# modifying the fetched array in place)
faces = fetch_olivetti_faces().data
faces = faces.reshape((faces.shape[0], 1, 64, 64))
dataset = torch.from_numpy(faces)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

#  try different values of d in the feature dimension
d = 16

# define a model and criterion
model = autoencoder_face(d)
# the reconstruction is compared to the input image with the MSE loss
criterion = nn.MSELoss()

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate,
                             weight_decay=1e-5)

for epoch in range(num_epochs):
    for data in dataloader:
        # the dataset holds images only, so each batch is the image tensor itself
        img = data
        # ===================forward=====================
        output, feature = model(img)
        loss = criterion(output, img)

        # ===================backward====================
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # log (loss of the last training batch of this epoch)
    print('epoch [{}/{}], loss:{:.4f}'
          .format(epoch+1, num_epochs, loss.item()))
    if epoch==0 or (epoch+1) % 50 == 0:
        pic_output = output.detach()
        save_image(pic_output, './face_vis/reconstruction/output_{}.png'.format(epoch+1))

        # lay out each encoded feature as one gray image with d*2 rows
        pic_feature = feature.detach().view(feature.size(0), 1, d*2, -1)
        save_image(pic_feature, './face_vis/feature/feature_{}.png'.format(epoch+1), normalize=True)

        # visualization
        img1 = mpimg.imread('./face_vis/reconstruction/output_{}.png'.format(epoch+1))
        img2 = mpimg.imread('./face_vis/feature/feature_{}.png'.format(epoch+1))

        plt.figure(figsize=(16, 4))
        plt.subplot(121).set_title('Reconstruction')
        plt.imshow(img1)
        plt.axis('off')
        plt.subplot(122).set_title('Encoded Feature')
        plt.imshow(img2)
        plt.axis('off')
        plt.show()

# save the trained network
torch.save(model.state_dict(), './conv_autoencoder_face.pth')
print('Finished.')

Now you can play with 'd' in PCA and in the autoencoder to compare the reconstruction from the PCA eigenfaces with that from the autoencoder.

**Question:** Should encoder and decoder have similar architecture?
