[ ]:
!pip install wget

Avoid Overfitting using Regularization
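Gradient descent below minimizes the mean squared error on polynomial features, optionally adding a penalty on the weights. For reference (notation chosen to match the run_gd implementation further down), the regularized objective is

$$L(\theta) = \frac{1}{N}\sum_{i=1}^{N}\left(x_i^\top\theta - y_i\right)^2 + \lambda R(\theta), \qquad R(\theta)=\sum_j|\theta_j|\ \text{(L1)} \quad\text{or}\quad \sum_j\theta_j^2\ \text{(L2)}.$$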

[2]:
import copy
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
mpl.rcParams["font.size"] = 20
mpl.rcParams["lines.linewidth"] = 2

from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold


def seed_all(seed):
    random.seed(seed)
    np.random.seed(seed)

# for reproducibility
SEED = 0
seed_all(SEED)

Load Dataset

[ ]:
!python -m wget https://raw.githubusercontent.com/deepchem/deepchem/master/datasets/delaney-processed.csv \
    --output delaney-processed.csv
[4]:
DELANEY_FILE = "delaney-processed.csv"
TASK_COL = 'measured log solubility in mols per litre'
df = pd.read_csv(DELANEY_FILE)
print(f"Number of molecules in the dataset: {df.shape[0]}")
Number of molecules in the dataset: 1128
[5]:
df
[5]:
     | Compound ID    | ESOL predicted log solubility in mols per litre | Minimum Degree | Molecular Weight | Number of H-Bond Donors | Number of Rings | Number of Rotatable Bonds | Polar Surface Area | measured log solubility in mols per litre | smiles
0    | Amigdalin      | -0.974 | 1 | 457.432 | 7 | 3 | 7 | 202.32 | -0.770 | OCC3OC(OCC2OC(OC(C#N)c1ccccc1)C(O)C(O)C2O)C(O)...
1    | Fenfuram       | -2.885 | 1 | 201.225 | 1 | 2 | 2 | 42.24  | -3.300 | Cc1occc1C(=O)Nc2ccccc2
2    | citral         | -2.579 | 1 | 152.237 | 0 | 0 | 4 | 17.07  | -2.060 | CC(C)=CCCC(C)=CC(=O)
3    | Picene         | -6.618 | 2 | 278.354 | 0 | 5 | 0 | 0.00   | -7.870 | c1ccc2c(c1)ccc3c2ccc4c5ccccc5ccc43
4    | Thiophene      | -2.232 | 2 | 84.143  | 0 | 1 | 0 | 0.00   | -1.330 | c1ccsc1
...  | ...            | ...    | ... | ...   | ... | ... | ... | ...  | ...  | ...
1123 | halothane      | -2.608 | 1 | 197.381 | 0 | 0 | 0 | 0.00   | -1.710 | FC(F)(F)C(Cl)Br
1124 | Oxamyl         | -0.908 | 1 | 219.266 | 1 | 0 | 1 | 71.00  | 0.106  | CNC(=O)ON=C(SC)C(=O)N(C)C
1125 | Thiometon      | -3.323 | 1 | 246.359 | 0 | 0 | 7 | 18.46  | -3.091 | CCSCCSP(=S)(OC)OC
1126 | 2-Methylbutane | -2.245 | 1 | 72.151  | 0 | 0 | 1 | 0.00   | -3.180 | CCC(C)C
1127 | Stirofos       | -4.320 | 1 | 365.964 | 0 | 1 | 5 | 44.76  | -4.522 | COP(=O)(OC)OC(=CCl)c1cc(Cl)c(Cl)cc1Cl

1128 rows × 10 columns

Polynomial Fitting

[6]:
X = df[["Molecular Weight", "Number of H-Bond Donors", "Number of Rings", "Number of Rotatable Bonds"]].values
y = df[TASK_COL].values.reshape(-1, 1)
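Before fitting, the four descriptors are expanded into all monomials up to order M with PolynomialFeatures. As a quick illustration (a sketch, not part of the original workflow), a degree-2 expansion of 4 features yields 15 columns: the constant term, 4 linear terms, and 10 quadratic terms.

[ ]:
# quick sanity check: count the polynomial features of degree <= 2
demo = PolynomialFeatures(degree=2).fit_transform(X[:1])
print(demo.shape)  # expected: (1, 15)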
[7]:
test_size = int(len(X)*0.1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, shuffle=True, random_state=SEED)
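With a 10% hold-out, roughly 1016 molecules are used for training and 112 for testing. A quick check (added for illustration):

[ ]:
# confirm the split sizes
print(X_train.shape, X_test.shape)  # expected: (1016, 4) (112, 4)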
[8]:
def run_gd(X_train, y_train, X_test, y_test, lr, M,
           n_epochs=20, normalize=True, norm=None,
           lamda=1, init="zero", return_mse=True):
    """Fit a degree-M polynomial model by (sub)gradient descent.

    norm selects the regularization penalty (None, "l1" or "l2") and lamda its
    strength; if return_mse is True, the unregularized MSE is recorded instead
    of the penalized loss.
    """
    assert init in ["zero", "random"]
    theta_list = []
    loss_list = []
    loss_test_list = []
    if return_mse:
        mse_list = []
        mse_test_list = []

    poly_features = PolynomialFeatures(degree=M)
    X_train_poly = poly_features.fit_transform(X_train)
    X_test_poly = poly_features.transform(X_test)

    if normalize:
        scaler = StandardScaler()
        X_train_poly = scaler.fit_transform(X_train_poly)
        X_test_poly = scaler.transform(X_test_poly)
    else:
        scaler = None

    X_train_poly = np.hstack([np.ones((X_train_poly.shape[0], 1)), X_train_poly])
    X_test_poly = np.hstack([np.ones((X_test_poly.shape[0], 1)), X_test_poly])

    if init == "zero":
        theta = np.zeros(X_train_poly.shape[1]).reshape(-1, 1)
    else:
        theta = np.random.randn(X_train_poly.shape[1]).reshape(-1, 1)

    # gradient descent loop
    for _ in range(n_epochs):
        theta_list.append(copy.deepcopy(theta))

        y_pred = X_train_poly @ theta
        loss = np.mean((y_pred - y_train).reshape(-1)**2)
        if return_mse:
            mse_list.append(loss)
        if norm == "l1":
            loss += lamda*np.sum(np.abs(theta))
        elif norm == "l2":
            loss += lamda*np.sum(theta**2)
        elif norm is not None:
            raise NotImplementedError(f"Norm {norm} is not a supported regularization")
        loss_list.append(loss)
        grad = 2*X_train_poly.T @ (X_train_poly @ theta - y_train) / y_train.shape[0]
        if norm == "l1":
            grad += lamda*np.sign(theta)
        elif norm == "l2":
            grad += 2*lamda*theta
        elif norm is not None:
            raise NotImplementedError(f"Norm {norm} is not a supported regularization")

        theta = theta - lr * grad

        # predict on the test set and compute its MSE
        y_test_pred = X_test_poly @ theta
        loss_test = np.mean((y_test_pred-y_test).reshape(-1)**2)
        if return_mse:
            mse_test_list.append(loss_test)
        if norm == "l1":
            loss_test += lamda*np.sum(np.abs(theta))
        elif norm == "l2":
            loss_test += lamda*np.sum(theta**2)
        elif norm is not None:
            raise NotImplementedError(f"Norm {norm} is not a supported regularization")
        loss_test_list.append(loss_test)
    if return_mse:
        return theta_list, mse_list, mse_test_list, scaler
    else:
        return theta_list, loss_list, loss_test_list, scaler
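The returned theta lives in the scaled polynomial feature space, so predictions on new descriptors must repeat the same preprocessing. A minimal sketch (predict_poly is a hypothetical helper, not part of the original notebook):

[ ]:
def predict_poly(X_new, theta, scaler, M):
    # mirror the preprocessing inside run_gd: polynomial expansion,
    # optional standardization, then an explicit bias column
    X_poly = PolynomialFeatures(degree=M).fit_transform(X_new)
    if scaler is not None:
        X_poly = scaler.transform(X_poly)
    X_poly = np.hstack([np.ones((X_poly.shape[0], 1)), X_poly])
    return X_poly @ theta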

Without Regularization

[9]:
lr = 5e-3
n_epochs = 1500
normalize = True
norm = None  # no regularization in GD

orders = [0, 1, 2, 3, 4, 5]
results = {}

for idx, M in enumerate(orders):
    theta_list, loss_list, loss_test_list, scaler = run_gd(X_train, y_train, X_test, y_test, lr, M, n_epochs, normalize)
    results[M] = (theta_list, loss_list, loss_test_list, scaler)
[10]:
plt.figure(figsize=(5, 5))
plt.plot(orders, [results[M][1][-1] for M in orders], "-", color="b",
         marker="o", markersize=10, markerfacecolor="none", markeredgecolor="b", label="Training")
plt.plot(orders, [results[M][2][-1] for M in orders], "-", color="r",
         marker="o", markersize=10, markerfacecolor="none", markeredgecolor="r", label="Test")
plt.xlabel("Polynomial Order")
plt.ylabel("Error (MSE)")
plt.xticks(orders)
plt.legend()
[10]:
<matplotlib.legend.Legend at 0x7ac9f2eac0d0>
[Figure: training and test MSE vs. polynomial order, no regularization]
[11]:
M = orders[-1]
theta_list, loss_list, loss_test_list, scaler = results[M]

f, ax = plt.subplots(1, 1, figsize=(5,5))

ax.plot(loss_list, c="blue", label="Training")
ax.plot(loss_test_list, c="red", label="Test")
plt.xlabel("Epoch")
plt.ylabel("Error (MSE)")
plt.legend()
[11]:
<matplotlib.legend.Legend at 0x7ac9f06a5090>
[Figure: training and test MSE vs. epoch for M = 5, no regularization]

L2 Norm
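With norm="l2" the squared (ridge) penalty is added to the loss, and its gradient to the update, exactly as implemented in run_gd above:

$$L(\theta) = \frac{1}{N}\sum_i\left(x_i^\top\theta - y_i\right)^2 + \lambda\sum_j\theta_j^2, \qquad \nabla_\theta L = \frac{2}{N}X^\top(X\theta - y) + 2\lambda\theta.$$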

[12]:
normalize = True
norm = "l2" # regularization in GD
lamda = 1

results = {}


for idx, M in enumerate(orders):
    theta_list, loss_list, loss_test_list, scaler = run_gd(X_train, y_train, X_test, y_test,
                                                           lr, M, n_epochs, normalize, norm=norm, lamda=lamda)
    results[M] = (theta_list, loss_list, loss_test_list, scaler)
[13]:
plt.figure(figsize=(5, 5))
plt.plot(orders, [results[M][1][-1] for M in orders], "-", color="b",
         marker="o", markersize=10, markerfacecolor="none", markeredgecolor="b", label="Training")
plt.plot(orders, [results[M][2][-1] for M in orders], "-", color="r",
         marker="o", markersize=10, markerfacecolor="none", markeredgecolor="r", label="Test")
plt.xlabel("Polynomial Order")
plt.ylabel("Loss")
plt.xticks(orders)
plt.legend()
[13]:
<matplotlib.legend.Legend at 0x7ac9f0692210>
[Figure: training and test MSE vs. polynomial order, L2 regularization]
[14]:
M = orders[-1]
theta_list, loss_list, loss_test_list, scaler = results[M]

f, ax = plt.subplots(1, 1, figsize=(5,5))

ax.plot(loss_list, c="blue", label="Training")
ax.plot(loss_test_list, c="red", label="Test")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
[14]:
<matplotlib.legend.Legend at 0x7aca2a3b3710>
[Figure: training and test MSE vs. epoch for M = 5, L2 regularization]

L1 Norm
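With norm="l1" the absolute-value (lasso) penalty is used; since it is not differentiable at zero, run_gd takes a subgradient step using sign(θ):

$$L(\theta) = \frac{1}{N}\sum_i\left(x_i^\top\theta - y_i\right)^2 + \lambda\sum_j|\theta_j|, \qquad g(\theta) = \frac{2}{N}X^\top(X\theta - y) + \lambda\,\mathrm{sign}(\theta).$$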

[15]:
normalize = True
norm = "l1" # regularization in GD
lamda = 1

results = {}

for idx, M in enumerate(orders):
    theta_list, loss_list, loss_test_list, scaler = run_gd(X_train, y_train, X_test, y_test,
                                                           lr, M, n_epochs, normalize, norm=norm, lamda=lamda)
    results[M] = (theta_list, loss_list, loss_test_list, scaler)
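The L1 penalty tends to shrink many coefficients toward zero. A quick way to inspect this for the highest-order fit (a sketch added for illustration; the 1e-2 threshold is arbitrary):

[ ]:
theta_final = results[orders[-1]][0][-1]  # last recorded theta for M = 5
print("near-zero coefficients:",
      int(np.sum(np.abs(theta_final) < 1e-2)), "of", theta_final.size)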
[16]:
plt.figure(figsize=(5, 5))
plt.plot(orders, [results[M][1][-1] for M in orders], "-", color="b",
         marker="o", markersize=10, markerfacecolor="none", markeredgecolor="b", label="Training")
plt.plot(orders, [results[M][2][-1] for M in orders], "-", color="r",
         marker="o", markersize=10, markerfacecolor="none", markeredgecolor="r", label="Test")
plt.xlabel("Polynomial Order")
plt.ylabel("Loss")
plt.xticks(orders)
plt.legend()
[16]:
<matplotlib.legend.Legend at 0x7ac9f06a5850>
[Figure: training and test MSE vs. polynomial order, L1 regularization]
[17]:
M = orders[-1]
theta_list, loss_list, loss_test_list, scaler = results[M]

f, ax = plt.subplots(1, 1, figsize=(5,5))

ax.plot(loss_list, c="blue", label="Training")
ax.plot(loss_test_list, c="red", label="Test")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
[17]:
<matplotlib.legend.Legend at 0x7ac9f0485b50>
[Figure: training and test MSE vs. epoch for M = 5, L1 regularization]