The ml_df generated earlier will be used to train a model, and the model's performance will be checked against the target properties from the processed data.

The model will be a neural network with two hidden layers, each containing the same number of neurons. K-fold cross-validation with k=5 will be used to reduce overfitting (see the sketch at the end of this section), and the MSE of each fold will serve as the accuracy metric.
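For reference, each fold's error is the mean squared error over its held-out samples,

$$\mathrm{MSE} = \frac{1}{n}\sum_{i=1}^{n}\left(y_i - \hat{y}_i\right)^2$$

where $y_i$ is the true property value, $\hat{y}_i$ is the model's prediction, and $n$ is the number of test samples; lower is better.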

As before, only the molecules with an atom count of exactly 19 will be used.

In [15]:
# import libraries
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from rdkit import Chem
from rdkit.Chem import Draw
from mordred import descriptors, Calculator
from molml.features import CoulombMatrix
import ast
In [16]:
# load dataset
ml_df = pd.read_csv('ml_df.csv')
df_19 = pd.read_csv('df_19.csv')
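The rows of ml_df (features) must line up one-to-one with the rows of df_19 (targets). A quick check, assuming both CSVs were written from the same filtered 19-atom subset:

In [ ]:
# Sanity check (assumption: both CSVs come from the same filtered subset)
assert len(ml_df) == len(df_19), "feature/target row counts do not match"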
In [17]:
# Defining NN model
def nn_model(X, y, neurons=64, epochs=100, test_size=0.2, batch_size=32, lr=0.001, random_state=42):
    # train_test_split
    # X is input features, and y is target values
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Convert to tensor
    X_train = torch.tensor(X_train.values, dtype=torch.float32)
    y_train = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1) 
    # Reshape into a 2D column vector with 1 column and as many rows as needed
    X_test = torch.tensor(X_test.values, dtype=torch.float32)
    y_test = torch.tensor(y_test.values, dtype=torch.float32).view(-1, 1)

    # Build model
    model = nn.Sequential(
        nn.Linear(X_train.shape[1], neurons), # input layer; X_train.shape[1] is the number of input features
        nn.ReLU(), # ReLU activation adds non-linearity between the hidden layers
        nn.Linear(neurons, neurons),
        nn.ReLU(),
        nn.Linear(neurons, 1)
    )

    # Instantiate loss function and optimizer
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr) # .parameters() returns all learnable parameters of the model

    # Training loop
    for epoch in range(epochs):
        model.train()
        perm = torch.randperm(X_train.size(0)) # Randomly shuffles indices in training data
        for i in range(0, X_train.size(0), batch_size): # Split into batches for efficiency and speed
            idx = perm[i:i+batch_size]
            batch_x, batch_y = X_train[idx], y_train[idx]
            optimizer.zero_grad()
            preds = model(batch_x)
            loss = criterion(preds, batch_y)
            loss.backward()
            optimizer.step()
        
    # Evaluation
    model.eval()
    with torch.no_grad():
        preds = model(X_test)
        mse = mean_squared_error(y_test.numpy(), preds.numpy())
        print(f"Test MSE: {mse:.4f}")

    return model, mse
In [18]:
# Test with 'mu' property
model, mse = nn_model(ml_df, df_19['mu'])
Test MSE: 0.7909
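nn_model above evaluates on a single train/test split; the k=5 cross-validation described at the top of this section is sketched below. kfold_mse is a hypothetical helper, not part of the original notebook: it reuses the same architecture and training loop, trains one fresh model per fold on KFold indices, and reports the per-fold and mean test MSE.

In [ ]:
# Minimal sketch of k=5 cross-validation around the same architecture.
# kfold_mse is a hypothetical helper (assumption, not in the original notebook).
def kfold_mse(X, y, k=5, neurons=64, epochs=100, batch_size=32, lr=0.001, random_state=42):
    kf = KFold(n_splits=k, shuffle=True, random_state=random_state)
    fold_mses = []
    for fold, (tr, te) in enumerate(kf.split(X), start=1):
        # Per-fold tensors, converted the same way as in nn_model
        X_train = torch.tensor(X.values[tr], dtype=torch.float32)
        y_train = torch.tensor(y.values[tr], dtype=torch.float32).view(-1, 1)
        X_test = torch.tensor(X.values[te], dtype=torch.float32)
        y_test = torch.tensor(y.values[te], dtype=torch.float32).view(-1, 1)

        # Fresh model per fold so no information leaks between folds
        model = nn.Sequential(
            nn.Linear(X_train.shape[1], neurons),
            nn.ReLU(),
            nn.Linear(neurons, neurons),
            nn.ReLU(),
            nn.Linear(neurons, 1)
        )
        criterion = nn.MSELoss()
        optimizer = optim.Adam(model.parameters(), lr=lr)

        # Same mini-batch training loop as nn_model
        for epoch in range(epochs):
            model.train()
            perm = torch.randperm(X_train.size(0))
            for i in range(0, X_train.size(0), batch_size):
                idx = perm[i:i+batch_size]
                optimizer.zero_grad()
                loss = criterion(model(X_train[idx]), y_train[idx])
                loss.backward()
                optimizer.step()

        # Evaluate on the held-out fold
        model.eval()
        with torch.no_grad():
            mse = mean_squared_error(y_test.numpy(), model(X_test).numpy())
        fold_mses.append(mse)
        print(f"Fold {fold} MSE: {mse:.4f}")

    print(f"Mean MSE over {k} folds: {np.mean(fold_mses):.4f}")
    return fold_mses

# Hypothetical usage: kfold_mse(ml_df, df_19['mu'])

Training one fresh model per fold is what makes the mean MSE an honest estimate: every sample is held out exactly once, so the average is less sensitive to a lucky or unlucky single split.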