In [None]:
import os, random
import numpy as np
import torch
import dataset_loader

import torch
import torch.nn as nn
import torch.nn.functional as F

# Lab exercises 3

The goal of this lab exercise is to help you learn how to use Pytorch, which you will need to use for the project.

It is important that you read the documentation to understand how to use Pytorch functions, what kind of transformation they apply, etc. You have to take the time to read it carefully to understand what you are doing.

- https://pytorch.org/docs/stable/nn.html
- https://pytorch.org/docs/stable/torch.html

I cannot stress it enough: **READ THE FU\*KING MANUAL**. Each time you use a function, check the manual, even if you think that you know how it works. If you don't read it, you are just making stup\*d decisions.

# 1. Pytorch basics

Instead of manipulating numpy arrays, we will manipulate pytorch tensors.
A lot of things are defined in the same way, except that you can use autograd!

Note that when using pytorch and the autograd mechanism, you want to avoid in-place operations, for reasons I didn't have time to cover in the course, sorry. :(
The only time you will need in place operation in this course is for parameter initialization.
It is easy to identify in place operations: their function name ends with an underscore!

In [None]:
# Three ways of building a tensor of shape (2, 4).
# By default the result is a float tensor that does not ask for gradient.

# values drawn at random
t = torch.rand((2, 4))

# constant tensors: full of 0 / full of 1
t_zeros = torch.zeros((2, 4))
t_ones = torch.ones((2, 4))

# one tensor per line
print(t, t_zeros, t_ones, sep="\n")

In [None]:
# The *_like functions build a new tensor with the same shape, dtype
# and device as their argument; only the values they are filled with differ.
# Note: the new tensor does not ask for gradient unless you request it.

t2 = torch.rand_like(t)         # random values
t2_zeros = torch.zeros_like(t)  # full of 0
t2_ones = torch.ones_like(t)    # full of 1

# one tensor per line
print(t2, t2_zeros, t2_ones, sep="\n")

In [None]:
# Tensors can also hold long values (i.e. integers),
# which will be useful to represent labels :)
# The type is selected with the dtype argument at creation.

t_zeros_long = torch.zeros((10,), dtype=torch.long)
print(t_zeros_long)

# compare with the (float) tensor created earlier
print(t_zeros_long.dtype, t_zeros.dtype)

In [None]:
# you can also initialize the tensor with values:
# torch.tensor() copies the given data, and dtype=torch.long
# asks for integer (long) values
t_long = torch.tensor([0, 1, 10, 20], dtype=torch.long)
print(t_long)

In [None]:
# Serious stuff now: gradient computation with autograd! :)

t = torch.rand((2, 10))

# a freshly created tensor does not require gradient :(
print(t.requires_grad)

# ...unless we ask for it explicitly
# (trailing underscore => in-place operation!)
t.requires_grad_(True)
print(t.requires_grad)

# the flag can also be given when the tensor is created
t = torch.rand((2, 10), requires_grad=True)
print(t.requires_grad)

# a deliberately simple operation: sum over all the elements of t,
# which returns a tensor holding a single value
z = torch.sum(t)
print(z.shape, z.requires_grad)

# backpropagation!
z.backward()

# gradient of z with respect to t:
# it should be full of 1, do you understand why?
print(t.grad)

In [None]:
# This cell reuses z and t from the previous cell: run that one first.
#
# if I call backward a second time, the new gradient is added to the one
# already stored in t.grad, so it will be a tensor full of 2
z.backward()
print(t.grad)

# we can reset the gradient of the tensor to zero
# => note the in-place operation (trailing underscore)
t.grad.zero_()
print(t.grad)

# and if we backprop again, it will be a tensor full of 1 again
z.backward()
print(t.grad)

# and this highlights one of the major sources of bugs in pytorch:
# Do not forget to reset your gradients!

In [None]:
# As in the previous lab exercise, the parameters of your network
# must be wrapped in a Parameter object:
# https://pytorch.org/docs/stable/generated/torch.nn.parameter.Parameter.html
# You should understand why :)
#
# Luckily, Pytorch ships with a lot of ready-made modules!
# They live in torch.nn, which is usually imported as nn.

# https://pytorch.org/docs/stable/generated/torch.nn.Linear.html#torch.nn.Linear
linear = nn.Linear(in_features=10, out_features=20)

# the linear transformation has two parameters,
# the projection matrix W and the bias:
# look, they are Parameter objects!
print(type(linear.weight))
print(type(linear.bias))

# the Linear class itself is not a subtype of Parameter
# but of Module, which represents a part of a network.
print(type(linear), isinstance(linear, nn.Module))

# Remember that we usually compute values on batches.
# In pytorch, datapoints are rows instead of columns
# (contrary to the two previous lab exercises with numpy!)
# For example, a batch with two random elements,
# each one of size 10, has shape (2, 10)
t_inputs = torch.rand((2, 10))
print(t_inputs.shape)

# compute the hidden representation after the linear transformation!
# note that the module object is called like a function
t_outputs = linear(t_inputs)
print(t_outputs.shape)

# Data loading and conversion

In [None]:
# Download the MNIST archive into the current directory,
# unless an entry named "mnist.pkl.gz" is already listed there
# (the "!" prefix runs wget as a shell command from the notebook)
if("mnist.pkl.gz" not in os.listdir(".")):
    !wget http://deeplearning.net/data/mnist/mnist.pkl.gz

# if you already have the file somewhere else, comment out the lines above
# and overwrite the path below
mnist_path = "./mnist.pkl.gz"

# load the 3 splits (train / dev / test);
# each split is unpacked as a pair (inputs, labels) by build_torch_inputs below
train_data, dev_data, test_data = dataset_loader.load_mnist(mnist_path)

def build_torch_inputs(data):
    """Turn a numpy (inputs, labels) split into a list of per-example dicts.

    data is a pair: the first element is indexed along its first axis to
    get each input, the second one holds the corresponding labels.

    Each element of the returned list is a dictionary with two keys:
    - "input_tensor": the i-th input converted with torch.from_numpy
      and reshaped as a single row, i.e. to shape (1, -1)
    - "output_value": the i-th label converted to a Python int
    """
    inputs, labels = data

    def convert(index):
        # one dictionary per datapoint
        return {
            "input_tensor": torch.from_numpy(inputs[index]).reshape(1, -1),
            "output_value": int(labels[index]),
        }

    return [convert(index) for index in range(inputs.shape[0])]
        
# replace each numpy split by its list-of-dictionaries version,
# converting them in the order train, dev, test
train_data, dev_data, test_data = map(
    build_torch_inputs,
    (train_data, dev_data, test_data)
)

In [None]:
# train_data is a list whose elements are dictionaries with two keys:
# - input_tensor: the input image as a row vector
# - output_value: the gold label

# shape of one input, then its label, one per line
print(
    train_data[10]["input_tensor"].shape,
    train_data[10]["output_value"],
    sep="\n"
)

In [None]:
# The loss is usually not computed on a single input nor on the full dataset,
# but on a subset of the data called a batch or minibatch.
# For example, with a batch of size 10,
# the input of the network is a tensor of shape (10, 784)
# where each row is a single input.
#
# In the data, each input has already been transformed into a row vector,
# so we only need to concatenate them.
#
# Here we show an example; of course this is done dynamically during training.

# Constructing a batched input
batch_size = 10
first_element = 20 # index in the training set of the first element of the batch

# the cat() function concatenates a list of tensors along a dimension:
# we want the batch dimension, i.e. the first one (dim=0)
batch_input = torch.cat(
    [
        example["input_tensor"]
        for example in train_data[first_element:first_element + batch_size]
    ],
    dim=0
)

# batch of ten flat images (10, 784)
print(batch_input.shape)

In [None]:
# just a helper function
def build_batch(data):
    """Build the batched inputs and labels for a list of examples.

    data is a sequence of dictionaries with two keys:
    - "input_tensor": a tensor holding one input as a row
    - "output_value": the gold label of this input

    Returns a pair (batch_inputs, labels):
    - batch_inputs: the input tensors concatenated along the first
      (batch) dimension
    - labels: a tensor of long values with one label per example
    """
    # the loop variable has its own name so it does not shadow the argument
    batch_inputs = torch.cat(
        [example["input_tensor"] for example in data],
        dim=0
    )

    labels = torch.tensor(
        [example["output_value"] for example in data],
        dtype=torch.long
    )

    return batch_inputs, labels

# Network definition and training

In [None]:
# A neural network is a class extending nn.Module
class MLPClassifier(nn.Module):
    """Classifier with one hidden layer followed by a ReLU non-linearity."""

    # the constructor can take any argument you need
    # to parameterize your network
    def __init__(self, input_dim, hidden_dim, output_dim):
        # the parent constructor must always be called,
        # otherwise the network will fail when you run it :)
        super().__init__()

        # The two projections:
        # - from the input space to the hidden space
        # - from the hidden space to the output space (i.e. logits, weights of each class)
        # Note that if you want to store modules in lists or dictionaries
        # instead of plain attributes, you need to use special containers:
        # https://pytorch.org/docs/stable/nn.html#containers
        # do you understand why?
        self.hidden_proj = nn.Linear(input_dim, hidden_dim)
        self.output_proj = nn.Linear(hidden_dim, output_dim)

        # custom initialization, note that:
        # - autograd is disabled with torch.no_grad()
        # - the functions are in-place ones (i.e. with an underscore at the end)
        with torch.no_grad():
            for projection in (self.hidden_proj, self.output_proj):
                nn.init.kaiming_uniform_(projection.weight.data)
                projection.bias.zero_()

    # forward is the function that computes the outputs.
    # It is never called directly:
    # the object is used as a function, as with linear layers
    def forward(self, inputs):
        # hidden representation: first projection, then relu
        hidden = torch.relu(self.hidden_proj(inputs))

        # logits: output projection
        return self.output_proj(hidden)

In [None]:
# Example of a training loop! :)

# hyper-parameters
n_epochs = 10
batch_size = 10

# Build the network
network = MLPClassifier(784, 200, 10)

# Build the optimizer, i.e. the object that will update parameters
# using the gradient information.
#
# SGD is standard gradient descent, but there are many alternatives!
# https://pytorch.org/docs/stable/optim.html
#
# The first argument of an optimizer is the set of parameters it will update,
# we can use network.parameters() to get all the parameters of our network

# set momentum=0 for standard gradient descent
optimizer = torch.optim.SGD(network.parameters(), lr=1e-3, momentum=0.9)

# Adam is a very good alternative.
#optimizer = torch.optim.Adam(network.parameters())


for epoch in range(n_epochs):
    print("%i / %i" % (epoch+1, n_epochs))

    # shuffle the dataset (in place)
    # it's a good practice to do this at the beginning of each epoch
    random.shuffle(train_data)

    # pass the network in training mode,
    # i.e. dropout will be applied if the dropout module is called
    network.train()

    for first_element in range(0, len(train_data), batch_size):
        # IMPORTANT
        # as gradient is accumulated, we need to reset all gradients
        # there are several ways of doing that,
        # the simplest is to call optimizer.zero_grad()
        # that resets the gradient of all parameters tracked by the optimizer
        optimizer.zero_grad()

        # build our batched input
        batch_input, labels = build_batch(train_data[first_element:first_element + batch_size])

        # compute the output weights/logits
        logits = network(batch_input)

        # compute the loss
        # https://pytorch.org/docs/stable/nn.functional.html#cross-entropy
        # the torch.nn.functional package (renamed as F) contains many
        # useful functions that are not network subparts (neither parameters nor modules)
        loss = F.cross_entropy(logits, labels)

        # compute the gradient
        loss.backward()

        # update parameters wrt gradient information!
        optimizer.step()

    # at the end of each epoch we evaluate on dev data
    n_correct = 0

    # pass network in eval mode,
    # i.e. if the dropout module is called,
    # it won't be applied
    network.eval()

    # disable auto-grad as we don't need it during evaluation:
    # this speeds things up a little bit + uses less memory.
    # No gradient is computed here, so there is nothing to reset
    # with optimizer.zero_grad() in this loop.
    with torch.no_grad():
        for first_element in range(0, len(dev_data), batch_size):
            batch_input, labels = build_batch(dev_data[first_element:first_element + batch_size])

            logits = network(batch_input)

            # logits is a tensor of shape (batch dim, n labels),
            # to compute the prediction we just compute the argmax
            # along the label dimension
            prediction = logits.argmax(dim=1)

            # compare prediction to gold and add to the counter
            # Be careful: .item() is used to get a plain Python number
            # (here the count of correct predictions)
            # instead of a pytorch tensor
            n_correct += (prediction == labels).sum().item()

    print("Dev acc: %.2f" % (100 * n_correct / len(dev_data)))

# TODO

The goal of this lab exercise is that you play a little bit with the code above so you can learn how to use pytorch.
I list here a sequence of things that you should be able to implement.

It is really important that you learn how to do that, it will be important for the project.
Of course you need to create the network variant and test it. :)

### 1. Regularization

You can try two types of regularization (they can be combined together):

- weight decay: it is a parameter of the optimizer
- dropout

For dropout, you need to create a dropout layer as part of your network. :)
It will be automatically enabled/disabled when you call network.train()/.eval().
https://pytorch.org/docs/stable/generated/torch.nn.Dropout.html#torch.nn.Dropout

In [None]:
# input full of 1, so the effect of dropout is easy to see
t = torch.ones((2, 4))
print(t)

dropout = nn.Dropout(p=0.5)

# train mode: dropout is applied
dropout.train()
t2 = dropout(t)
print(t2)

# eval mode: dropout is not applied
dropout.eval()
t3 = dropout(t)
print(t3)

# WARNING => of course you don't call train()/eval() directly on the dropout object:
# you call the ones of the network, which recursively call them on all
# its module attributes!

A common trick for training neural networks is gradient clipping: if the norm of the gradient is too big, we rescale the gradient. This trick can be used to prevent exploding gradients and also to avoid making "too big steps" in the wrong direction due to the use of approximate gradient computation in SGD.

In [None]:
# Gradient clipping goes between backward() and step() in the training loop.
# Self-contained demonstration on one training batch, reusing the network,
# optimizer and batch_size defined for the training loop:
network.train()
optimizer.zero_grad()
batch_input, labels = build_batch(train_data[:batch_size])
batch_loss = F.cross_entropy(network(batch_input), labels)

batch_loss.backward()  # compute gradient
# rescale the gradient if its norm exceeds 5
# (clip_grad_value_ would instead clamp each gradient value independently)
torch.nn.utils.clip_grad_norm_(network.parameters(), 5.)
optimizer.step()  # update parameters

### 2. Deeper network

The second exercise will be to create a deep network!

We will explore 2 different ways of doing that.

**(1)** The simplest technique is to build a list of linear projections in the constructor and set it as an attribute of the network. But **warning**: you should not use a Python list directly, you must instead use a nn.ModuleList(). Luckily, it works like a list: you can append objects and loop over its content. You will also need to update the initialization (to do a loop over all layers!) and the forward pass.

In [None]:
# Example 1 (exercise skeleton: the TODO parts are left to you,
# the class cannot be instantiated before they are completed)
class DeepMLPClassifier1(nn.Module):
    """Deep MLP classifier whose hidden projections are stored in a nn.ModuleList."""

    def __init__(self, input_dim, hidden_dim, output_dim, n_hidden_layers):
        super().__init__()
        
        # TODO: build the hidden projections here.
        # This part must also define d, the input dimension of the
        # output projection created just below.
            
        self.output_proj = nn.Linear(d, output_dim)
        self.dropout = nn.Dropout(0.5)
        
        # custom initialization: autograd disabled + in-place functions
        with torch.no_grad():
            torch.nn.init.kaiming_uniform_(self.output_proj.weight.data)
            self.output_proj.bias.zero_()
            
            # TODO: initialize the hidden projections too
        
    def forward(self, inputs):
        """Compute the logits for a batch of inputs."""
        # TODO: compute z, the last hidden representation,
        # which is given to the output projection below

        return self.output_proj(z)
    
network = DeepMLPClassifier1(100, 200, 10, 3)

# small check that must pass,
# but you should also train it correctly to see if results improve!
batch = torch.rand(10, 100)
output = network(batch)
print(output.shape)

# actually enforce the check: one row of 10 logits per input of the batch
assert output.shape == (10, 10), output.shape

**(2)** The second technique is based on a nn.Sequential() object: https://pytorch.org/docs/stable/generated/torch.nn.Sequential.html#torch.nn.Sequential

The idea behind a Sequential() object is that it is a list of sub-modules. When you call the object, it will just execute one module after the other, passing as input of the next one the result of the previous one.

Here is an example on how to use this to construct a single projection with non-linearity and dropout:

In [None]:
# Example of usage: a projection defined as a Sequential object,
# i.e. a linear transformation, then relu, then dropout
seq = nn.Sequential(nn.Linear(10, 5), nn.ReLU(), nn.Dropout(p=0.5))

# batched input
inputs = torch.rand((3, 10))

# calling seq successively calls the 3 subnetworks,
# each one receiving the output of the previous one
outputs = seq(inputs)
print(outputs.shape)

Unfortunately, a nn.Sequential() is a little bit more difficult to build than a nn.ModuleList() because it doesn't have an append() method... but you can use a list comprehension and then pass the elements of the list as separate arguments to the constructor.

In [None]:
# here the list is a single argument
print([1, 2, 3])

# here we call print with 3 different arguments,
# notice how the output is different
print(1, 2, 3)

# so, how do we call a function by passing the values
# from a list as separate arguments?
# Well, like this (the * unpacks the list):
print(*[1, 2, 3])

# notice that this last output is similar to the second one,
# and different from the first! :)

In [None]:
# Example 2 (exercise skeleton: the TODO parts are left to you,
# forward cannot run before they are completed)
class DeepMLPClassifier2(nn.Module):
    """Deep MLP classifier whose hidden part is built with nn.Sequential."""

    def __init__(self, input_dim, hidden_dim, output_dim, n_hidden_layers):
        super().__init__()
        
        # TODO: build the hidden part of the network here
        
        # without hidden layer, the output projection reads the input directly
        self.output_proj = nn.Linear(input_dim if n_hidden_layers == 0 else hidden_dim, output_dim)
                
        # custom initialization: autograd disabled + in-place functions
        with torch.no_grad():
            torch.nn.init.kaiming_uniform_(self.output_proj.weight.data)
            self.output_proj.bias.zero_()
            
            #TODO: initialize the hidden part too
            
        
    def forward(self, inputs):
        """Compute the logits for a batch of inputs."""
        # TODO: compute z, the representation given to the output projection
        return self.output_proj(z)
    
network = DeepMLPClassifier2(100, 200, 10, 3)

# small check that must pass,
# but you should also train it correctly to see if results improve!
batch = torch.rand(10, 100)
output = network(batch)
print(output.shape)

# actually enforce the check: one row of 10 logits per input of the batch
assert output.shape == (10, 10), output.shape

### 3. Custom module

Now, instead of using the Linear layer, we are going to implement a custom submodule that will:

- apply a linear/affine transformation
- apply a non-linearity
- apply dropout

However, we will do that **without** nn.Linear(). Remember that matrix multiplication is done with the operator @.

Instead, you need to use your own parameters for the projection matrix and bias.
In the course example and in the previous lab, we used a projection defined as follows: Ax + b where x is the input, A the projection matrix and b the bias vector. However, this assumes that the input is a column vector, or, if batched, a matrix where each input is a column of x.

In Pytorch, we use a different format: a single input is a row vector, and a batched input is a matrix where each row is a different input. So first, let's think a little bit:

- How is the linear projection defined in this case? (no batch, just a single row vector x as input)
- what is the shape of A? of b?
- in the case of a batched input, you need to be careful so that broadcasting is applied correctly. Think about this and what it implies for the shapes of the parameters.

To create a parameter in the constructor, you can do: self.whatever = nn.parameter.Parameter(torch.empty(..., ...))

It creates a tensor that is uninitialized!


**ANSWERS**

xA + b with shapes:

- x: (1, input dim)
- A: (input dim, output dim)
- b: (1, output dim)


In [None]:
class MLP(nn.Module):
    """Custom layer to implement as an exercise: affine transformation,
    non-linearity and dropout, without using nn.Linear."""

    def __init__(self, input_dim, output_dim):
        super().__init__()
        
        # TODO: create your own parameters (projection matrix and bias)
        # and the dropout module
            
    def forward(self, inputs):
        """Compute the output of the layer (returns None until implemented)."""
        return # TODO


# NOTE: this class has the same name as the MLPClassifier defined
# in the "Network definition and training" section: once this cell is run,
# the name MLPClassifier refers to this 4-argument version.
class MLPClassifier(nn.Module):
    """Deep classifier skeleton for the custom module exercise
    (the TODO parts are left to you, forward cannot run before they are completed)."""

    def __init__(self, input_dim, hidden_dim, output_dim, n_hidden_layers):
        super().__init__()
        
        # TODO: build the hidden part of the network here
            
        # without hidden layer, the output projection reads the input directly
        self.output_proj = nn.Linear(input_dim if n_hidden_layers == 0 else hidden_dim, output_dim)
                
        # custom initialization: autograd disabled + in-place functions
        with torch.no_grad():
            torch.nn.init.kaiming_uniform_(self.output_proj.weight.data)
            self.output_proj.bias.zero_()            
        
    def forward(self, inputs):
        """Compute the logits for a batch of inputs."""
        # TODO: compute z, the representation given to the output projection
        return self.output_proj(z)
    
network = MLPClassifier(100, 200, 10, 3)

# small check that must pass,
# but you should also train it correctly to see if results improve!
batch = torch.rand(10, 100)
output = network(batch)
print(output.shape)

# actually enforce the check: one row of 10 logits per input of the batch
assert output.shape == (10, 10), output.shape

###  4. Convolutional Neural Network

Build a network that relies on a CNN instead of a MLP to classify MNIST images
(you can still have a single layer MLP on top of convolutions, after pooling!).
Note that this will require you to reshape the input images!

https://pytorch.org/docs/stable/nn.html#torch.nn.Conv2d

In [None]:
t = torch.rand((10, 100))  # t is a batch of 10 "flat" pictures
# Conv2d expects a batched input of shape (batch, channels, height, width):
# each flat picture becomes a single-channel 10x10 picture.
# Without the channel dimension, a (10, 10, 10) tensor would be read
# as ONE picture with 10 channels.
t = t.reshape(10, 1, 10, 10)