Predicting log EC50s of Dual-Agonist Peptides Using a Convolutional Neural Network

The example in this notebook is adapted from the following paper:

Puszkarska, A.M., Taddese, B., Revell, J. et al. Machine learning designs new GCGR/GLP-1R dual agonists with enhanced biological potency. Nat. Chem. (2024)

Here is the link to the original GitHub repository: PeptideModels

The paper designs dual-agonist peptides that target the human glucagon receptor (GCGR) and the GLP-1 receptor (GLP-1R). For multi-task learning, the loss function of the original model combines the MSE loss on the peptide–GCGR log EC50 with the MSE loss on the peptide–GLP-1R log EC50.
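With the equal task weights used in the multi_task_loss function defined later in this notebook, the training objective can be written as

$$
\mathcal{L} \;=\; \tfrac{1}{2}\,\mathrm{MSE}\big(\hat{y}_{\mathrm{GCGR}},\, y_{\mathrm{GCGR}}\big) \;+\; \tfrac{1}{2}\,\mathrm{MSE}\big(\hat{y}_{\mathrm{GLP\text{-}1R}},\, y_{\mathrm{GLP\text{-}1R}}\big),
$$

where each MSE is computed between the predicted and measured log EC50 values for that receptor.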

[ ]:
!pip install wget seaborn tqdm

Import Packages

[2]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset

from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

mpl.rcParams["font.size"] = 24
mpl.rcParams["lines.linewidth"] = 2
[3]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)
Device: cpu

Download & Load Dataset

[ ]:
!python -m wget https://raw.githubusercontent.com/xuhuihuang/uwmadisonchem361/refs/heads/main/CNN_training_data.csv \
--output CNN_training_data.csv
[ ]:
filename = "CNN_training_data.csv"
df = pd.read_csv(filename)
df

pep_ID EC50_T1 EC50_LOG_T1 EC50_T2 EC50_LOG_T2 Aligned_Sequence
0 seq_pep1 3.75 -11.43 563.00 -9.25 HSQGTFTSDYSKYLDSRRAQDFVQWLEEGE
1 seq_pep2 18.50 -10.73 552.00 -9.26 HSQGTFTSDYSKYLDSRRAEDFVQWLENGE
2 seq_pep3 3.51 -11.45 252.00 -9.60 HSQGTFTSDYSKYLDSRRAEDFVQWLENT-
3 seq_pep4 50.50 -10.30 6.03 -11.22 HSQGTFTSDYSKYLDSRRAEDFVQWLVAGG
4 seq_pep5 2.87 -11.54 238.00 -9.62 HSQGTFTSDYSKYLDSRRAQDFVQWLEAEG
... ... ... ... ... ... ...
120 seq_pep121 29000.00 -7.54 9.62 -11.02 HGEGTFTSDVSSYMERQSVDEFIAWLLKGR
121 seq_pep122 29400.00 -7.53 7.94 -11.10 HGEGTFTSDVSSYMESQLVDEFIAWLLKGR
122 seq_pep123 29500.00 -7.53 29500.00 -7.53 HGEGTFTSDVSSYMEPQSTDEFIAWLLKGR
123 seq_pep124 29200.00 -7.53 598.00 -9.22 HGEGTFTSDVSSYMDFQSLVEFLAWLLKGR
124 seq_pep125 1110.00 -8.95 30200.00 -7.52 HGEGTFTSDLSKQMDFESLVLFLEWLDNG-

125 rows × 6 columns
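Before building the dataset class, a quick look at the spread of the two regression targets (log EC50 against GCGR and GLP-1R) can help; a minimal sketch using the df loaded above:

[ ]:
# Summary statistics of the two targets: EC50_LOG_T1 (GCGR) and EC50_LOG_T2 (GLP-1R)
df[["EC50_LOG_T1", "EC50_LOG_T2"]].describe()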

Create PyTorch Dataset

Convert each peptide sequence to a one-hot encoding

onehot.png
[5]:
def seq2onehot(sequence):
    """One-hot encode a peptide sequence into a (length, 21) array."""
    AMINOACIDS = 'ACDEFGHIKLMNPQRSTVWY'
    # One row per residue: 20 amino-acid columns plus one extra column for any
    # character not in AMINOACIDS (e.g. the '-' alignment gap).
    onehot = np.zeros((len(sequence), len(AMINOACIDS)+1))
    aa_to_idx = {aa: idx for idx, aa in enumerate(AMINOACIDS)}
    for i, aa in enumerate(sequence):
        if aa in aa_to_idx:
            onehot[i, aa_to_idx[aa]] = 1
        else:
            onehot[i, -1] = 1
    return onehot
[6]:
seq = df.iloc[0]["Aligned_Sequence"]
print("Peptide sequence:", seq)
onehot = seq2onehot(seq)
print("Onehot Embedding:")
print(onehot)
Peptide sequence: HSQGTFTSDYSKYLDSRRAQDFVQWLEEGE
Onehot Embedding:
[[0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
[7]:
class PeptideDataset(Dataset):
    def __init__(self, df, encoder,
                 seq_col="Aligned_Sequence", target_cols=("EC50_LOG_T1", "EC50_LOG_T2")):
        self.df = df
        self.encoder = encoder
        self.seq_col = seq_col
        self.target_cols = target_cols

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        sequence = row[self.seq_col]
        onehot_sequence = self.encoder(sequence)
        # Transpose to (channels, length) = (21, 30) so that Conv1d sees channels first.
        onehot_sequence = np.transpose(onehot_sequence)
        # Two regression targets per peptide: log EC50 against GCGR (T1) and GLP-1R (T2).
        label = [row[col] for col in self.target_cols]
        return torch.tensor(onehot_sequence, dtype=float), torch.tensor(label, dtype=float).reshape(1, -1)

Split Dataset

[8]:
from sklearn.model_selection import train_test_split

# training/validation dataset
data_size = df.shape[0]
test_ratio = 0.10
test_size = int(data_size*test_ratio)
train_indices, test_indices = train_test_split(range(data_size), test_size=test_size, shuffle=True)
print(f"Training size: {len(train_indices)}, test size: {len(test_indices)}")
train_df, test_df = df.iloc[train_indices], df.iloc[test_indices]
Training size: 113, test size: 12
[9]:
# create dataloaders
batch_size = 10
encoder = seq2onehot
train_data = PeptideDataset(train_df, encoder=encoder)
train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size,
                                           shuffle=True, drop_last=False)
test_data = PeptideDataset(test_df, encoder=encoder)
test_loader = torch.utils.data.DataLoader(test_data, batch_size=batch_size,
                                          shuffle=False, drop_last=False)
high_level_arch.png
[10]:
## uncomment the block to see one datapoint
# train_data[0]
x, y = train_data[0]
print("Shape of one datapoint:", x.shape)
print("Shape of one label:", y.shape)
Shape of one datapoint: torch.Size([21, 30])
Shape of one label: torch.Size([1, 2])
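As an additional sanity check (a minimal sketch using the loaders defined above), one batch from the training loader should stack these into shapes (batch_size, 21, 30) and (batch_size, 1, 2):

[ ]:
x_batch, y_batch = next(iter(train_loader))
print("Batch input shape:", x_batch.shape)   # expected: torch.Size([10, 21, 30])
print("Batch label shape:", y_batch.shape)   # expected: torch.Size([10, 1, 2])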

CNN Model

one_block.png cnn_arch.png
[11]:
class BaseModel(nn.Module):

    def __init__(self, in_shape, dropout=0.3):
        super().__init__()

        # Defining NN layers in Conv Block #1
        in_channels = in_shape[0]
        self.conv1 = nn.Sequential(
            nn.Conv1d(in_channels, out_channels=256, kernel_size=3, stride=1, padding=0, bias=False),
            nn.ReLU(),
        )
        self.bn1 = nn.BatchNorm1d(num_features=256)
        self.pool1 = nn.MaxPool1d(kernel_size=2)
        self.dropout1 = nn.Dropout1d(p=dropout)

        # Defining NN layers in Conv Block #2
        self.conv2 = nn.Sequential(
            nn.Conv1d(in_channels=256, out_channels=512, kernel_size=3, stride=1, padding=0, bias=False),
            nn.ReLU(),
        )
        self.bn2 = nn.BatchNorm1d(num_features=512)
        self.pool2 = nn.MaxPool1d(kernel_size=2)
        self.dropout2 = nn.Dropout1d(p=dropout)

        # Defining NN layers in Conv Block #3
        self.conv3 = nn.Sequential(
            nn.Conv1d(in_channels=512, out_channels=128, kernel_size=3, stride=1, padding=1, bias=False),
            nn.ReLU(),
        )
        self.bn3 = nn.BatchNorm1d(num_features=128)
        self.pool3 = nn.MaxPool1d(kernel_size=2)
        self.dropout3 = nn.Dropout1d(p=dropout)

        # Run a dummy input through the conv blocks to determine the flattened feature size
        self._dummy_input_shape = (1, in_channels, in_shape[1])
        self.dummy_input = torch.randn(self._dummy_input_shape)
        self.flattened_size = self._calculate_flattened_size()

        self.dropout = nn.Dropout1d(p=dropout)

        # Defining fully connected layer (128 hidden neurons)
        self.dense1 = nn.Sequential(
            nn.Linear(in_features=self.flattened_size, out_features=128, bias=True),
            nn.ReLU(),
        )

        # Defining fully connected layer (64 hidden neurons)
        self.dense2 = nn.Sequential(
            nn.Linear(in_features=128, out_features=64, bias=True),
            nn.ReLU(),
        )

    def forward(self, input_seqs):
        # Calling NN layers in Conv Block #1
        x = self.dropout1(self.pool1(self.bn1(self.conv1(input_seqs))))
        # Calling NN layers in Conv Block #2
        x = self.dropout2(self.pool2(self.bn2(self.conv2(x))))
        # Calling NN layers in Conv Block #3
        x = self.dropout3(self.pool3(self.bn3(self.conv3(x))))

        # Pooling and concatenate the data
        length = x.shape[-1]
        max_pool = nn.MaxPool1d(kernel_size=length)(x).squeeze(-1)
        avg_pool = nn.AvgPool1d(kernel_size=length)(x).squeeze(-1)
        x = torch.cat([max_pool, avg_pool], dim=-1)

        # Calling the dense NN layer #1
        x = self.dropout(self.dense1(x))
        # Calling the dense NN layer #2
        x = self.dropout(self.dense2(x))

        return x

    def _calculate_flattened_size(self):
        """Pass dummy data through the network to get the flattened size."""
        with torch.no_grad():
            x = self.dummy_input
            x = self.dropout1(self.pool1(self.bn1(self.conv1(x))))
            x = self.dropout2(self.pool2(self.bn2(self.conv2(x))))
            x = self.dropout3(self.pool3(self.bn3(self.conv3(x))))

            # Global max pooling and average pooling over the sequence length collapse
            # each 30-residue peptide into a single feature vector: conv3 outputs 128
            # channels, so each pooling gives 128 features, or 256 after concatenation.
            length = x.shape[-1]
            max_pool = nn.MaxPool1d(kernel_size=length)(x).squeeze(-1)
            avg_pool = nn.AvgPool1d(kernel_size=length)(x).squeeze(-1)
            x = torch.cat([max_pool, avg_pool], dim=-1)
        return x.shape[1]


class MultiHeadModel(nn.Module):

    def __init__(self, in_shape, dropout=0.3):
        super().__init__()

        self.base_model = BaseModel(in_shape=in_shape, dropout=dropout)
        self.head_1 = nn.Linear(in_features=64, out_features=1, bias=True)
        self.head_2 = nn.Linear(in_features=64, out_features=1, bias=True)

    def forward(self, input_seqs):

        base_out = self.base_model(input_seqs)
        out_1 = self.head_1(base_out)
        out_2 = self.head_2(base_out)

        return torch.cat([out_1, out_2], dim=1)
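A minimal sketch (with an arbitrary batch of random inputs) to confirm that the two-head model returns one prediction per receptor, i.e. an output of shape (batch, 2):

[ ]:
_model = MultiHeadModel(in_shape=(21, 30)).float()
_dummy = torch.randn(4, 21, 30)      # 4 peptides, 21 one-hot channels, 30 positions
print(_model(_dummy).shape)          # expected: torch.Size([4, 2])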
[12]:
def train_one_epoch(model, criterion, optimizer, dataloader):
    losses = []
    model.train()
    for x, y_true in dataloader:
        if device == "cuda":
            x, y_true = x.to(device), y_true.to(device)
        x, y_true = x.float(), y_true.float()
        optimizer.zero_grad()
        y_pred = model(x) # Forward propagation is called here.
        # Multi-task MSE loss averaged over the two receptor targets
        loss = criterion(y_pred, y_true.squeeze(1))
        loss.backward() # Backpropagation
        optimizer.step()
        losses.append(loss.cpu().detach().item())
    return losses


def val_one_epoch(model, criterion, dataloader):
    losses = []
    model.eval()
    with torch.no_grad():
        for x, y_true in dataloader:
            if device == "cuda":
                x, y_true = x.to(device), y_true.to(device)
            x, y_true = x.float(), y_true.float()
            y_pred = model(x)
            loss = criterion(y_pred, y_true.squeeze(1))
            losses.append(loss.cpu().detach().item())
    return losses

Loss function

loss.png
[13]:
def multi_task_loss(y_pred, y_true):
    y1_pred, y1_true = y_pred[:, 0], y_true[:, 0]
    y2_pred, y2_true = y_pred[:, -1], y_true[:, -1]
    mse1 = nn.MSELoss()(y1_pred, y1_true)
    mse2 = nn.MSELoss()(y2_pred, y2_true)
    return 0.5 * mse1 + 0.5 * mse2
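Because the two tasks are weighted equally, this loss is numerically identical to a plain MSE taken over the stacked two-column targets. A quick check with hypothetical random tensors:

[ ]:
_yp = torch.randn(8, 2)   # hypothetical predictions: 8 peptides, 2 receptors
_yt = torch.randn(8, 2)   # hypothetical targets
print(multi_task_loss(_yp, _yt).item(), nn.MSELoss()(_yp, _yt).item())  # the two values agree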

Training

[14]:
model = MultiHeadModel(in_shape=(21, 30))
total_params = sum(p.numel() for p in model.parameters())
print(f"Total number of parameters: {total_params}")


model = model.to(device)
model = model.float()
criterion = multi_task_loss
# using a gradient-descent-based method (AdamW) for optimization
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

n_epochs = 1500
Total number of parameters: 649026
[15]:
train_loss = []
val_loss = []

for epoch in tqdm.tqdm(range(n_epochs)):
    losses = train_one_epoch(model, criterion, optimizer, train_loader)
    train_loss.append(np.mean(losses))
    losses = val_one_epoch(model, criterion, test_loader)
    val_loss.append(np.mean(losses))
100%|██████████| 1500/1500 [09:23<00:00,  2.66it/s]
[16]:
f, ax = plt.subplots(1, 1, figsize=(5,5))

ax.plot(train_loss, c="blue", label="Training")
ax.plot(val_loss, c="red", label="Test")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
[16]:
<matplotlib.legend.Legend at 0x7c9f3213df10>
../../_images/examples_deep_nn_Reference_ch6_Part_1_CNNs_colab_27_1.png

Evaluation metrics

[17]:
y1_true = []
y1_pred = []
y2_true = []
y2_pred = []

model.eval()
with torch.no_grad():
    for x,y in test_loader:
        if device == "cuda":
            x = x.to(device)
        x = x.float()

        y_pred = model(x)
        y1_pred.extend([y_pred[i, 0].item() for i in range(len(y_pred))])
        y2_pred.extend([y_pred[i, 1].item() for i in range(len(y_pred))])

        y = y.reshape(y_pred.shape)
        y1_true.extend([y[i, 0].item() for i in range(len(y))])
        y2_true.extend([y[i, 1].item() for i in range(len(y))])
[18]:
tmp_df = pd.DataFrame({"y1": y1_true, r"$\hat{y_1}$": y1_pred,
                       "y2": y2_true, r"$\hat{y_2}$": y2_pred})

# scatter plot
g = sns.JointGrid(x="y1", y=r"$\hat{y_1}$", data=tmp_df)
g = g.plot_joint(plt.scatter, c="green", alpha=0.5)

# line: y_pred = y
y_line = np.linspace(np.min(y1_true), np.max(y1_true), 200)
g.ax_joint.plot(y_line, y_line, color="blue", linestyle="--");

# histograms
g = g.plot_marginals(sns.histplot, color="green", kde=False)

g.ax_joint.set_xlim(np.min(y_line), np.max(y_line))
g.ax_joint.set_ylim(np.min(y_line), np.max(y_line))

plt.show()
../../_images/examples_deep_nn_Reference_ch6_Part_1_CNNs_colab_31_0.png
[19]:
print(f"MSE: {mean_squared_error(y1_true, y1_pred):.2f}")
print(f"Coefficient of determination: {r2_score(y1_true, y1_pred):.2f}")
MSE: 0.85
Coefficient of determination: 0.64
[20]:
# scatter plot
g = sns.JointGrid(x="y2", y=r"$\hat{y_2}$", data=tmp_df)
g = g.plot_joint(plt.scatter, c="blue", alpha=0.5, label="GLP-1R")

# line: y_pred = y
y_line = np.linspace(np.min(y2_true), np.max(y2_true), 200)
g.ax_joint.plot(y_line, y_line, color="blue", linestyle="--");

# histograms
g = g.plot_marginals(sns.histplot, color="blue", kde=False)

g.ax_joint.set_xlim(np.min(y_line), np.max(y_line))
g.ax_joint.set_ylim(np.min(y_line), np.max(y_line))

plt.show()
../../_images/examples_deep_nn_Reference_ch6_Part_1_CNNs_colab_35_0.png
[21]:
print(f"MSE: {mean_squared_error(y2_true, y2_pred):.2f}")
print(f"Coefficient of determination: {r2_score(y2_true, y2_pred):.2f}")
MSE: 0.63
Coefficient of determination: 0.67
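Finally, a minimal sketch of how the trained model could score a new 30-residue aligned peptide (here simply reusing the first sequence from the table above to illustrate the input format, and assuming, as in the plots above, that head 1 corresponds to GCGR (T1) and head 2 to GLP-1R (T2)):

[ ]:
new_seq = "HSQGTFTSDYSKYLDSRRAQDFVQWLEEGE"   # query peptide (30 aligned residues)
x_new = torch.tensor(seq2onehot(new_seq).T, dtype=torch.float32).unsqueeze(0)  # shape (1, 21, 30)
model.eval()
with torch.no_grad():
    log_ec50_gcgr, log_ec50_glp1r = model(x_new.to(device))[0]
print(f"Predicted log EC50 (GCGR):   {log_ec50_gcgr.item():.2f}")
print(f"Predicted log EC50 (GLP-1R): {log_ec50_glp1r.item():.2f}")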