The ml_df feature matrix generated earlier will be used to train a model, and performance will be evaluated against the target properties from the processed data.
The model is a neural network with two hidden layers of equal width. K-fold cross-validation with k=5 will be used to reduce overfitting, with MSE computed on each fold and averaged to represent accuracy (a sketch of the fold loop follows the model definition below).
We continue to work only with molecules whose atom count equals 19.
In [15]:
# import libraries
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from rdkit import Chem
from rdkit.Chem import Draw
from mordred import descriptors, Calculator
from molml.features import CoulombMatrix
import ast
In [16]:
# load dataset
ml_df = pd.read_csv('ml_df.csv')
df_19 = pd.read_csv('df_19.csv')
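Before training, it is worth confirming that the feature matrix and the property table are row-aligned, since the model below pairs them positionally. A quick sanity check (the one-to-one row alignment is an assumption about the earlier processing, not verified output):
# Check that features and targets have matching row counts
# (assumes ml_df rows correspond one-to-one to df_19 rows)
print(ml_df.shape, df_19.shape)
assert len(ml_df) == len(df_19), "feature and property tables must be row-aligned"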
In [17]:
# Defining the NN model: two hidden layers of equal width, trained with Adam on MSE loss
def nn_model(X, y, neurons=64, epochs=100, test_size=0.2, batch_size=32, lr=0.001, random_state=42):
    # train/test split: X holds the input features, y the target values
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    # Convert to tensors; targets are reshaped into 2D column vectors
    # (one column, as many rows as needed)
    X_train = torch.tensor(X_train.values, dtype=torch.float32)
    y_train = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1)
    X_test = torch.tensor(X_test.values, dtype=torch.float32)
    y_test = torch.tensor(y_test.values, dtype=torch.float32).view(-1, 1)
    # Build model
    model = nn.Sequential(
        nn.Linear(X_train.shape[1], neurons),  # X_train.shape[1] is the number of input features
        nn.ReLU(),                             # ReLU activation adds non-linearity between the hidden layers
        nn.Linear(neurons, neurons),
        nn.ReLU(),
        nn.Linear(neurons, 1)
    )
    # Instantiate loss function and optimizer
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)  # .parameters() returns all learnable parameters of the model
    # Training loop
    for epoch in range(epochs):
        model.train()
        perm = torch.randperm(X_train.size(0))  # Randomly shuffle training indices each epoch
        for i in range(0, X_train.size(0), batch_size):  # Iterate in mini-batches for efficiency and speed
            idx = perm[i:i+batch_size]
            batch_x, batch_y = X_train[idx], y_train[idx]
            optimizer.zero_grad()
            preds = model(batch_x)
            loss = criterion(preds, batch_y)
            loss.backward()
            optimizer.step()
    # Evaluation on the held-out test set
    model.eval()
    with torch.no_grad():
        preds = model(X_test)
    mse = mean_squared_error(y_test.numpy(), preds.numpy())
    print(f"Test MSE: {mse:.4f}")
    return model, mse
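The function above evaluates on a single train/test split; the k=5 cross-validation described in the introduction can wrap the same architecture and training loop. This is a minimal sketch, not the notebook's confirmed implementation: the helper name kfold_mse is an assumption, and it reuses nn_model's default hyperparameters.
# Sketch of k-fold cross-validation around the same two-hidden-layer model
# (helper name kfold_mse is hypothetical; e.g. kfold_mse(ml_df, df_19['mu']))
def kfold_mse(X, y, k=5, neurons=64, epochs=100, batch_size=32, lr=0.001, random_state=42):
    kf = KFold(n_splits=k, shuffle=True, random_state=random_state)  # shuffle so fold membership is random
    fold_mses = []
    for fold, (train_idx, test_idx) in enumerate(kf.split(X), start=1):
        X_train = torch.tensor(X.values[train_idx], dtype=torch.float32)
        y_train = torch.tensor(y.values[train_idx], dtype=torch.float32).view(-1, 1)
        X_test = torch.tensor(X.values[test_idx], dtype=torch.float32)
        y_test = torch.tensor(y.values[test_idx], dtype=torch.float32).view(-1, 1)
        # Fresh model per fold, same architecture as nn_model
        model = nn.Sequential(
            nn.Linear(X_train.shape[1], neurons),
            nn.ReLU(),
            nn.Linear(neurons, neurons),
            nn.ReLU(),
            nn.Linear(neurons, 1)
        )
        criterion = nn.MSELoss()
        optimizer = optim.Adam(model.parameters(), lr=lr)
        for epoch in range(epochs):
            model.train()
            perm = torch.randperm(X_train.size(0))
            for i in range(0, X_train.size(0), batch_size):
                idx = perm[i:i+batch_size]
                optimizer.zero_grad()
                loss = criterion(model(X_train[idx]), y_train[idx])
                loss.backward()
                optimizer.step()
        # Score this fold on its held-out portion
        model.eval()
        with torch.no_grad():
            preds = model(X_test)
        fold_mse = mean_squared_error(y_test.numpy(), preds.numpy())
        fold_mses.append(fold_mse)
        print(f"Fold {fold} MSE: {fold_mse:.4f}")
    print(f"Mean MSE over {k} folds: {np.mean(fold_mses):.4f}")
    return fold_mses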
In [18]:
# Test with 'mu' property
model, mse = nn_model(ml_df, df_19['mu'])
Test MSE: 0.7909
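An unscaled MSE is hard to judge on its own; one quick reference point is the variance of the target, which equals the MSE of always predicting the mean. The baseline below is computed over the full column rather than the test split, so it is only a rough comparison.
# Baseline: predicting the mean of 'mu' yields an MSE equal to its variance
baseline_mse = df_19['mu'].var(ddof=0)
print(f"Predict-the-mean baseline MSE: {baseline_mse:.4f}")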