This is based on code from the following book

The following blog post walks through what PyTorch's optimizers are.

PyTorch comes with a module of optimizers. We can replace our vanilla gradient descent with many different ones without modifying a lot of code.

%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot
import torch
torch.set_printoptions(edgeitems=2, linewidth=75)

Taking our input from the previous notebook and applying our scaling

# Target values, one per sample.
# NOTE(review): presumably temperatures in Celsius - confirm.
t_c = torch.tensor([0.5, 14.0, 15.0, 28.0, 11.0,
8.0, 3.0, -4.0, 6.0, 13.0, 21.0])
# Input measurements in unknown units; same length as t_c and compared with
# it element by element in the loss.
t_u = torch.tensor([35.7, 55.9, 58.2, 81.9, 56.3, 48.9,
33.9, 21.8, 48.4, 60.4, 68.4])
# Inputs rescaled by a factor of 0.1.
t_un = 0.1 * t_u

Same model and loss function as before.

def model(t_u, w, b):
    """Apply the linear model: scale the input by ``w``, then shift by ``b``."""
    scaled = w * t_u
    return scaled + b
def loss_fn(t_p, t_c):
    """Return the mean of the squared differences between ``t_p`` and ``t_c``."""
    residuals = t_p - t_c
    return (residuals ** 2).mean()
import torch.optim as optim

dir(optim)
['ASGD',
'LBFGS',
'Optimizer',
'RMSprop',
'Rprop',
'SGD',
'__builtins__',
'__cached__',
'__doc__',
'__file__',
'__name__',
'__package__',
'__path__',
'__spec__',
'lr_scheduler']
# Trainable parameters [w, b], unpacked into model() as (w, b).
# requires_grad=True makes autograd record gradients for the optimizer.
params = torch.tensor([1.0, 0.0], requires_grad=True)
learning_rate = 1e-5
# The optimizer is handed the tensors it is responsible for updating.
optimizer = optim.SGD([params], lr=learning_rate)

The values of our parameters are updated when we call step.

The code below forgets to zero out the gradients!

# One optimization step on the raw (unscaled) inputs.
t_p = model(t_u, *params)
loss = loss_fn(t_p, t_c)
# Fills params.grad. optimizer.zero_grad() is deliberately not called here,
# so a later backward() would add to this gradient instead of replacing it.
loss.backward()

# Updates params using the gradient currently stored in params.grad.
optimizer.step()

# Display the updated parameters.
params

Now we can use this snippet in a loop for training

# Start again from the initial parameters so this step does not build on an
# earlier update or on a gradient left behind in params.grad.
params = torch.tensor([1.0, 0.0], requires_grad=True)
learning_rate = 1e-2
optimizer = optim.SGD([params], lr=learning_rate)

# Forward pass on the rescaled inputs.
t_p = model(t_un, *params)
loss = loss_fn(t_p, t_c)

# backward() accumulates into .grad, so clear it before computing the new
# gradient; this is the snippet that gets repeated inside a training loop.
optimizer.zero_grad()
loss.backward()
optimizer.step()

# Display the updated parameters.
params
def training_loop(n_epochs, optimizer, params, t_u, t_c):
    """Fit ``params`` by running ``n_epochs`` optimizer steps on all samples.

    Args:
        n_epochs: Number of optimization steps to run.
        optimizer: Optimizer built over ``params``; its ``zero_grad()`` and
            ``step()`` are called once per epoch.
        params: Parameters unpacked into ``model`` as ``(w, b)``.
        t_u: Model inputs.
        t_c: Target values the predictions are compared against.

    Returns:
        The ``params`` object that was passed in.
    """
    for epoch in range(1, n_epochs + 1):
        t_p = model(t_u, *params)
        loss = loss_fn(t_p, t_c)

        # backward() adds into .grad rather than overwriting it, so the
        # gradient must be cleared every epoch; otherwise each step would
        # use the sum of all gradients computed so far.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if epoch % 500 == 0:
            print('Epoch %d, Loss %f' % (epoch, float(loss)))

    return params
# Fresh parameters so the run starts from w=1, b=0 instead of continuing from
# whatever the earlier cells left in params.
params = torch.tensor([1.0, 0.0], requires_grad=True)
learning_rate = 1e-2
# The optimizer must wrap the very same tensor that is passed to the loop.
optimizer = optim.SGD([params], lr=learning_rate)

training_loop(
    n_epochs=5000,
    optimizer=optimizer,
    params=params,
    t_u=t_un,  # rescaled inputs
    t_c=t_c)
Epoch 500, Loss 7.860116
Epoch 1000, Loss 3.828538
Epoch 1500, Loss 3.092191
Epoch 2000, Loss 2.957697
Epoch 2500, Loss 2.933134
Epoch 3000, Loss 2.928648
Epoch 3500, Loss 2.927830
Epoch 4000, Loss 2.927679
Epoch 4500, Loss 2.927652
Epoch 5000, Loss 2.927647

And we get the same loss

# Re-initialise the parameters so Adam is compared from the same starting
# point as the SGD run, not from SGD's already-trained values.
params = torch.tensor([1.0, 0.0], requires_grad=True)
learning_rate = 1e-1
# Only the optimizer class changes; the training loop is reused as is.
optimizer = optim.Adam([params], lr=learning_rate)

training_loop(
    n_epochs=2000,
    optimizer=optimizer,
    params=params,
    t_u=t_u,  # raw, unscaled inputs this time
    t_c=t_c)
Epoch 500, Loss 7.612903
Epoch 1000, Loss 3.086700
Epoch 1500, Loss 2.928578
Epoch 2000, Loss 2.927646

## Training and Validation Splits

n_samples = t_u.shape[0]
# Hold out 20% of the samples (rounded down) for validation.
n_val = int(0.2 * n_samples)
n_train = n_samples - n_val

# Random permutation of the sample indices.
shuffled_indices = torch.randperm(n_samples)

# Slice by the training count instead of by -n_val: when n_val is 0,
# [:-0] is empty and [-0:] is the whole tensor, which would swap the splits.
train_indices = shuffled_indices[:n_train]
val_indices = shuffled_indices[n_train:]

# Display both index tensors.
train_indices, val_indices
(tensor([ 5,  9,  1,  6,  7, 10,  3,  8,  0]), tensor([2, 4]))
# Pick the inputs and targets of each split with the shuffled index tensors.
train_t_u, val_t_u = t_u[train_indices], t_u[val_indices]
train_t_c, val_t_c = t_c[train_indices], t_c[val_indices]

# Rescaled inputs for both splits.
train_t_un, val_t_un = 0.1 * train_t_u, 0.1 * val_t_u
def training_loop(n_epochs, optimizer, params, train_t_u, val_t_u,
                  train_t_c, val_t_c, print_periodically=True):
    """Train on the training split and record the validation loss per epoch.

    Args:
        n_epochs: Number of optimization steps to run. Must be at least 1,
            because the final losses are read from the last epoch.
        optimizer: Optimizer built over ``params``.
        params: Parameters unpacked into ``model`` as ``(w, b)``.
        train_t_u: Training inputs.
        val_t_u: Validation inputs.
        train_t_c: Training targets.
        val_t_c: Validation targets.
        print_periodically: When true, print both losses for the first three
            epochs and then every 500th epoch.

    Returns:
        A flat tuple: the elements of ``params``, then the last training
        loss and the last validation loss as floats, then the list of
        validation losses from every epoch.
    """
    val_loss_each_epoch = []
    for epoch in range(1, n_epochs + 1):
        train_t_p = model(train_t_u, *params)
        train_loss = loss_fn(train_t_p, train_t_c)

        # Validation only measures the current parameters (before this
        # epoch's update), so no autograd graph is needed for it.
        with torch.no_grad():
            val_t_p = model(val_t_u, *params)
            val_loss = loss_fn(val_t_p, val_t_c)

        val_loss_each_epoch.append(val_loss.item())

        # backward() accumulates into .grad; clear it so each step uses only
        # the gradient of this epoch's training loss.
        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()

        if print_periodically and (epoch <= 3 or epoch % 500 == 0):
            print(f"\tEpoch {epoch}, Training loss {train_loss.item():.4f},"
                  f" Validation loss {val_loss.item():.4f}")

    return (*params, train_loss.item(), val_loss.item(), val_loss_each_epoch)
# Start from w=1, b=0 so this run is independent of the earlier cells.
params = torch.tensor([1.0, 0.0], requires_grad=True)
learning_rate = 1e-2
optimizer = optim.SGD([params], lr=learning_rate)

training_loop(
    n_epochs=3000,
    optimizer=optimizer,
    params=params,
    train_t_u=train_t_un,  # rescaled training inputs
    val_t_u=val_t_un,  # rescaled validation inputs
    train_t_c=train_t_c,
    val_t_c=val_t_c)
Epoch 1, Training loss 85.6554, Validation loss 56.5547
Epoch 2, Training loss 43.9632, Validation loss 11.2589
Epoch 3, Training loss 36.8792, Validation loss 4.2194
Epoch 500, Training loss 7.1544, Validation loss 2.7312
Epoch 1000, Training loss 3.5517, Validation loss 2.5743
Epoch 1500, Training loss 3.1001, Validation loss 2.5225
Epoch 2000, Training loss 3.0435, Validation loss 2.5046
Epoch 2500, Training loss 3.0364, Validation loss 2.4983
Epoch 3000, Training loss 3.0355, Validation loss 2.4961

3.0354840755462646,
2.4961061477661133)

## Searching

# One summary dict per run; turned into a DataFrame afterwards.
results = []
# Maps each run name to a DataFrame of its per-epoch validation loss.
val_loss_over_time_by_name = {}
optimizer_names = [
    'ASGD',
    'RMSprop',
    'Rprop',
    'SGD',
]
learning_rates = [1e-4, 1e-3, 1e-2]
# NOTE(review): 5000 is listed twice, so two runs share a name; the second
# overwrites the first in val_loss_over_time_by_name and adds a duplicate row
# to results. Confirm whether a different epoch count was intended.
epochs = [500, 5000, 5000]

for optimizer_name in optimizer_names:
    for learning_rate in learning_rates:
        for number_of_epochs in epochs:
            name = f"{optimizer_name} alpha {learning_rate} epochs {number_of_epochs}"
            print(name)

            # Every configuration gets its own freshly initialised parameters;
            # sharing one tensor would make each run continue from where the
            # previous one stopped and the comparison meaningless.
            params = torch.tensor([1.0, 0.0], requires_grad=True)
            # Look the optimizer class up by name in torch.optim.
            optimizer = getattr(optim, optimizer_name)([params], lr=learning_rate)

            learned_params = training_loop(
                n_epochs=number_of_epochs,
                optimizer=optimizer,
                params=params,
                train_t_u=train_t_un,
                val_t_u=val_t_un,
                train_t_c=train_t_c,
                val_t_c=val_t_c,
                print_periodically=True,
            )
            # beta_1 is the weight applied to the measurement in unknown
            # units, beta_0 the intercept.
            beta_1, beta_0, train_loss, val_loss, val_loss_over_time = learned_params

            results.append(
                {
                    "optimizer_name": optimizer_name,
                    "learning_rate": learning_rate,
                    "number_of_epochs": number_of_epochs,
                    "name": name,
                    "w": beta_1.item(),
                    "b": beta_0.item(),
                    "train_loss": train_loss,
                    "val_loss": val_loss,
                }
            )

            # reset_index() turns the list position into a column, which is
            # then labelled as the epoch (counted from 0).
            val_loss_over_time_df = pd.DataFrame(val_loss_over_time).reset_index()
            val_loss_over_time_df.columns = ["epoch", "val_loss"]

            val_loss_over_time_by_name[name] = val_loss_over_time_df
# One row per run; the keys of each results dict become the columns.
df = pd.DataFrame(results)
df
Sorting our dataframe by val_loss in ascending order to see which configuration performed best.

# Lowest validation loss first (sort_values sorts in ascending order by default).
df = df.sort_values("val_loss")
df
def show_values_on_bars(axs):
    """Write each bar's height, to two decimals, centred at the top of the bar.

    Adapted from https://stackoverflow.com/a/51535326.

    Args:
        axs: A single axes object, or a NumPy array of axes objects. Each
            one must expose ``patches`` and ``text``.
    """
    # A NumPy array is visited element by element; anything else is treated
    # as one axes object.
    axes = axs.flat if isinstance(axs, np.ndarray) else (axs,)
    for ax in axes:
        for bar in ax.patches:
            center_x = bar.get_x() + bar.get_width() / 2
            height = bar.get_height()
            top_y = bar.get_y() + height
            ax.text(center_x, top_y, f"{height:.2f}", ha="center")

## Visualizing Loss Over Time

# Display the mapping from run name to its per-epoch validation-loss DataFrame.
val_loss_over_time_by_name
#experiment_name = "Adamax alpha 0.01 epochs 5000"