In [ ]:
# import libraries for EDA
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import MinMaxScaler
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim.lr_scheduler import ReduceLROnPlateau
from rdkit import Chem
from rdkit.Chem import Draw
from mordred import descriptors, Calculator
from molml.features import CoulombMatrix
In [2]:
# Load dataset from Data wrangling
df = pd.read_csv('processed_data.csv')
df.head()
Out[2]:
A B C mu alfa homo lumo gap R² zpve U0 U H G Cv smiles mol num_atoms num_heavy_atoms canonical_smile
0 3.69951 1.23093 1.09709 5.0509 78.30 -0.2676 0.0141 0.2818 1120.8417 0.137571 -380.79782 -380.79050 -380.78955 -380.82970 28.666 N#CC1NC11C2CC1C2 <rdkit.Chem.rdchem.Mol object at 0x3abc242e0> 17 9 N#CC1NC12C1CC2C1
1 3.58161 1.13798 0.87264 2.5256 55.40 -0.2868 -0.0597 0.2271 1180.5099 0.064339 -525.78340 -525.77594 -525.77496 -525.81620 25.955 OC1=NON=C1OC=O <rdkit.Chem.rdchem.Mol object at 0x3abc24510> 11 9 O=COc1nonc1O
2 2.35658 1.75261 1.52228 1.5279 83.85 -0.2229 0.0807 0.3036 1029.1179 0.196525 -367.18655 -367.17874 -367.17780 -367.21826 32.345 CC12CCC1CN1CC21 <rdkit.Chem.rdchem.Mol object at 0x3abc24580> 22 9 CC12CCC1CN1CC12
3 2.20077 1.50794 0.92898 2.1100 72.11 -0.2575 -0.0123 0.2451 1230.8475 0.136089 -455.19177 -455.18280 -455.18185 -455.22604 32.000 CN=C1OC(=O)CC1N <rdkit.Chem.rdchem.Mol object at 0x3abc245f0> 17 9 CN=C1OC(=O)CC1N
4 2.43658 1.01176 0.74884 2.7860 75.68 -0.2791 0.0257 0.3048 1496.1036 0.168867 -440.29135 -440.28058 -440.27963 -440.32758 37.969 CC(CO)C(CO)C#N <rdkit.Chem.rdchem.Mol object at 0x3abc24660> 20 9 CC(CO)C(C#N)CO

Questions to ask in EDA:

  1. What are the main clusters according to the atomic number?
  2. How are the properties distributed in these clusters?
  3. What is the distribution of heavy atoms in each molecule?
  4. How are the properties distributed according to the number of heavy atoms?
  5. What molecular descriptors can be used to represent the molecules?
  6. What type of molecular descriptors yelds the best property prediction?
In [3]:
# Checking the counts of molecules, depending on their atom counts
df['num_atoms'].value_counts().sort_index().plot.bar(figsize=(10,6), rot=0, edgecolor='k')
plt.title("Frequency of molecules according to their number of atoms")
plt.xlabel('Number of atoms')
plt.ylabel('Absolute frequency')
plt.show()

# Kinda follows normal distribution
No description has been provided for this image
In [4]:
# Checking property distribution along the number of atoms in the molecule
plt.figure(figsize=(16, 18))
plot_counter = 1
for prop in df.iloc[:,:15].columns:
    plt.subplot(5, 3, plot_counter)
    plt.scatter(df['num_atoms'],df[prop],s=40, alpha = 0.5) # s = size of the marker
    for atom in np.unique(df['num_atoms']):
        plt.scatter(atom, df[prop][df['num_atoms'] == atom].mean(), c='k',label='mean') # c = color
        # Plotting the mean value of a property for molecules that have a specific number of atoms

    handles, labels = plt.gca().get_legend_handles_labels() # gca = get current axis, manually captures the current legend entries 
    by_label = dict(zip(labels, handles))
    plt.legend(by_label.values(), by_label.keys())
    plt.title(prop)
    plot_counter = plot_counter + 1
plt.suptitle('Number of atoms vs properties', x=0.5, y=0, ha='center', size=16) 
plt.tight_layout()
plt.show()
No description has been provided for this image
In [5]:
# Checking the counts of molecules, depending on their heavy atom counts
df['num_heavy_atoms'].value_counts().sort_index().plot.bar(figsize=(10,6), rot=0, edgecolor='k')
plt.title("Frequency of molecules according to their number of heavy atoms")
plt.xlabel('Number of heavy atoms')
plt.ylabel('Absolute frequency')
plt.show()
No description has been provided for this image
In [6]:
# Distribution of properties based on number of heavy atoms
plt.figure(figsize=(16, 18))
plot_counter = 1
for prop in df.iloc[:,:15].columns:
    plt.subplot(5, 3, plot_counter)
    plt.scatter(df['num_heavy_atoms'],df[prop],s=40, alpha = 0.5) # s = size of the marker
    for atom in np.unique(df['num_heavy_atoms']):
        plt.scatter(atom, df[prop][df['num_heavy_atoms'] == atom].mean(), c='k',label='mean') # c = color
        # Plotting the mean value of a property for molecules that have a specific number of atoms

    handles, labels = plt.gca().get_legend_handles_labels() # gca = get current axis, manually captures the current legend entries 
    by_label = dict(zip(labels, handles))
    plt.legend(by_label.values(), by_label.keys())
    plt.title(prop)
    plot_counter = plot_counter + 1
plt.suptitle('Number of heavy atoms vs properties', x=0.5, y=0, ha='center', size=16) 
plt.tight_layout()
plt.show()
No description has been provided for this image

Numerical values of descriptors quantify pyhsiochemical information of molecule. (Somewhat like referece) Two types of descriptors:

  • Experimental: values measured in labs
  • Theoretical: values calculated from molecular structure --> data from SMILES strings

Will use SMILES to calculate the theoretical descriptors and compare with the experimental descriptors (properties) that were loaded on the dataset. Inputing the Mol object (that was derived from SMILES string) into the Caclulator function, returns SMILES derived theoretical descriptors. The theoretical descriptors will be stored on a seperate csv file.

In [13]:
# Due to memory issue, will focus on molecules with 19 atoms
#df['mol'] = df['smiles'].apply(Chem.MolFromSmiles) # re-instantizing mol object, not sure why not work without..
df_19 = df[df['num_atoms'] == 19]
mol_19 = df_19['mol'].to_list()

calc = Calculator(descriptors, ignore_3D=True) # Initializing Mordred descriptor calculator
# Skipping 3D because SMILES string do not contain 3D structure by default

# calc input is expected to be list-like iterables of RDKitMol obejcts
# pandas() function calculates molecular descriptors for a list of RDKit Mol objects and returns the result as a pandas DataFrame.
df_desc = calc.pandas(mol_19, nproc=1)

# Save the results in csv file
df_desc.to_csv("descriptors.csv")
In [ ]:
df_desc.info()
<class 'mordred._base.pandas_module.MordredDataFrame'>
RangeIndex: 18333 entries, 0 to 18332
Columns: 1613 entries, ABC to mZagreb2
dtypes: bool(2), float64(943), int64(326), object(342)
memory usage: 225.4+ MB
In [ ]:
df_desc.loc[:,df_desc.dtypes == object] # Filters for columns (theoretical descriptors) that are non-numerical objects
# Non-numerical descriptors are typically not wanted in machine learning
# Can be thought as "failed descriptors"
Out[ ]:
ABC ABCGG AATS6dv AATS7dv AATS8dv AATS6d AATS7d AATS8d AATS6s AATS7s ... MDEC-44 MDEO-11 MDEO-12 MDEO-22 MDEN-11 MDEN-12 MDEN-13 MDEN-22 MDEN-23 MDEN-33
0 module 'numpy' has no attribute 'float'.\n`np.... module 'numpy' has no attribute 'float'.\n`np.... 0.0 invalid value encountered in scalar divide (AA... invalid value encountered in scalar divide (AA... 1.0 invalid value encountered in scalar divide (AA... invalid value encountered in scalar divide (AA... 2.25 invalid value encountered in scalar divide (AA... ... float division by zero (MDEC-44) float division by zero (MDEO-11) float division by zero (MDEO-12) float division by zero (MDEO-22) float division by zero (MDEN-11) 0.333333 float division by zero (MDEN-13) float division by zero (MDEN-22) float division by zero (MDEN-23) float division by zero (MDEN-33)
1 module 'numpy' has no attribute 'float'.\n`np.... module 'numpy' has no attribute 'float'.\n`np.... 0.0 invalid value encountered in scalar divide (AA... invalid value encountered in scalar divide (AA... 1.0 invalid value encountered in scalar divide (AA... invalid value encountered in scalar divide (AA... 4.0 invalid value encountered in scalar divide (AA... ... float division by zero (MDEC-44) 0.25 float division by zero (MDEO-12) float division by zero (MDEO-22) float division by zero (MDEN-11) float division by zero (MDEN-12) float division by zero (MDEN-13) float division by zero (MDEN-22) float division by zero (MDEN-23) float division by zero (MDEN-33)
2 module 'numpy' has no attribute 'float'.\n`np.... module 'numpy' has no attribute 'float'.\n`np.... 0.0 invalid value encountered in scalar divide (AA... invalid value encountered in scalar divide (AA... 1.0 invalid value encountered in scalar divide (AA... invalid value encountered in scalar divide (AA... 1.0 invalid value encountered in scalar divide (AA... ... float division by zero (MDEC-44) float division by zero (MDEO-11) float division by zero (MDEO-12) 1.144714 float division by zero (MDEN-11) float division by zero (MDEN-12) float division by zero (MDEN-13) float division by zero (MDEN-22) float division by zero (MDEN-23) float division by zero (MDEN-33)
3 module 'numpy' has no attribute 'float'.\n`np.... module 'numpy' has no attribute 'float'.\n`np.... 0.0 0.0 invalid value encountered in scalar divide (AA... 1.4 1.0 invalid value encountered in scalar divide (AA... 2.0 1.0 ... float division by zero (MDEC-44) float division by zero (MDEO-11) float division by zero (MDEO-12) float division by zero (MDEO-22) float division by zero (MDEN-11) float division by zero (MDEN-12) float division by zero (MDEN-13) float division by zero (MDEN-22) float division by zero (MDEN-23) float division by zero (MDEN-33)
4 module 'numpy' has no attribute 'float'.\n`np.... module 'numpy' has no attribute 'float'.\n`np.... invalid value encountered in scalar divide (AA... invalid value encountered in scalar divide (AA... invalid value encountered in scalar divide (AA... invalid value encountered in scalar divide (AA... invalid value encountered in scalar divide (AA... invalid value encountered in scalar divide (AA... invalid value encountered in scalar divide (AA... invalid value encountered in scalar divide (AA... ... 1.0 float division by zero (MDEO-11) float division by zero (MDEO-12) float division by zero (MDEO-22) float division by zero (MDEN-11) float division by zero (MDEN-12) float division by zero (MDEN-13) float division by zero (MDEN-22) float division by zero (MDEN-23) float division by zero (MDEN-33)
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
18328 module 'numpy' has no attribute 'float'.\n`np.... module 'numpy' has no attribute 'float'.\n`np.... 0.869565 0.0 0.0 1.521739 1.133333 1.0 2.362319 2.4 ... float division by zero (MDEC-44) 0.2 0.816497 float division by zero (MDEO-22) float division by zero (MDEN-11) float division by zero (MDEN-12) float division by zero (MDEN-13) float division by zero (MDEN-22) float division by zero (MDEN-23) float division by zero (MDEN-33)
18329 module 'numpy' has no attribute 'float'.\n`np.... module 'numpy' has no attribute 'float'.\n`np.... 0.0 0.0 invalid value encountered in scalar divide (AA... 1.0 1.0 invalid value encountered in scalar divide (AA... 2.428571 1.0 ... float division by zero (MDEC-44) float division by zero (MDEO-11) float division by zero (MDEO-12) float division by zero (MDEO-22) float division by zero (MDEN-11) float division by zero (MDEN-12) float division by zero (MDEN-13) float division by zero (MDEN-22) float division by zero (MDEN-23) float division by zero (MDEN-33)
18330 module 'numpy' has no attribute 'float'.\n`np.... module 'numpy' has no attribute 'float'.\n`np.... 0.0 invalid value encountered in scalar divide (AA... invalid value encountered in scalar divide (AA... 1.0 invalid value encountered in scalar divide (AA... invalid value encountered in scalar divide (AA... 1.0 invalid value encountered in scalar divide (AA... ... float division by zero (MDEC-44) float division by zero (MDEO-11) 0.333333 float division by zero (MDEO-22) float division by zero (MDEN-11) float division by zero (MDEN-12) float division by zero (MDEN-13) float division by zero (MDEN-22) float division by zero (MDEN-23) float division by zero (MDEN-33)
18331 module 'numpy' has no attribute 'float'.\n`np.... module 'numpy' has no attribute 'float'.\n`np.... 0.0 invalid value encountered in scalar divide (AA... invalid value encountered in scalar divide (AA... 1.0 invalid value encountered in scalar divide (AA... invalid value encountered in scalar divide (AA... 1.0 invalid value encountered in scalar divide (AA... ... float division by zero (MDEC-44) float division by zero (MDEO-11) float division by zero (MDEO-12) float division by zero (MDEO-22) float division by zero (MDEN-11) float division by zero (MDEN-12) float division by zero (MDEN-13) 0.25 float division by zero (MDEN-23) float division by zero (MDEN-33)
18332 module 'numpy' has no attribute 'float'.\n`np.... module 'numpy' has no attribute 'float'.\n`np.... 0.0 0.0 invalid value encountered in scalar divide (AA... 1.0 1.0 invalid value encountered in scalar divide (AA... 1.888889 1.0 ... float division by zero (MDEC-44) float division by zero (MDEO-11) float division by zero (MDEO-12) float division by zero (MDEO-22) float division by zero (MDEN-11) float division by zero (MDEN-12) float division by zero (MDEN-13) float division by zero (MDEN-22) 0.5 float division by zero (MDEN-33)

18333 rows × 342 columns

In [ ]:
# Will be filtering only numerical descriptors and saving them as floats
df_desc = df_desc.select_dtypes(include=np.number).astype('float32')

# Remove zero-variance descriptors; meaning they have the same value for every molecule, and this is useless
df_desc = df_desc.loc[:, df_desc.var() > 0.0]

# Normalize descriptors
ml_df = pd.DataFrame(MinMaxScaler().fit_transform(df_desc), columns = df_desc.columns)
       nAcid  nBase   SpAbs_A   SpMax_A  SpDiam_A    SpAD_A   SpMAD_A  \
0        0.0   0.25  0.632654  0.282702  0.312114  0.632654  0.520152   
1        0.0   0.00  0.577712  0.344103  0.324511  0.577712  0.448385   
2        0.0   0.00  0.718230  0.461089  0.392428  0.718230  0.631937   
3        0.0   0.25  0.445863  0.411582  0.285618  0.445863  0.497206   
4        0.0   0.00  0.774736  0.882543  0.615138  0.774736  0.705747   
...      ...    ...       ...       ...       ...       ...       ...   
18328    0.0   0.00  0.655759  0.345513  0.261054  0.655759  0.550333   
18329    0.0   0.00  0.606835  0.561208  0.619596  0.606835  0.486427   
18330    0.0   0.00  0.587318  0.550382  0.607644  0.587318  0.460932   
18331    0.0   0.25  0.684768  0.514773  0.523119  0.684768  0.588227   
18332    0.0   0.00  0.574154  0.342334  0.275092  0.574154  0.443737   

        LogEE_A     VE1_A     VE2_A  ...     SRW10    TSRW10        MW  \
0      0.406013  0.762119  0.432482  ...  0.362848  0.175078  0.577790   
1      0.422122  0.644515  0.350093  ...  0.387277  0.512834  0.666350   
2      0.573866  0.768991  0.437295  ...  0.487907  0.692720  0.755160   
3      0.375299  0.154861  0.307297  ...  0.402668  0.625277  0.288895   
4      0.874278  0.770940  0.438661  ...  0.848614  0.944406  0.577540   
...         ...       ...       ...  ...       ...       ...       ...   
18328  0.476172  0.351128  0.144559  ...  0.351611  0.617624  0.755160   
18329  0.561110  0.661877  0.362256  ...  0.671559  0.275610  0.577540   
18330  0.560150  0.673702  0.370541  ...  0.660839  0.272932  0.666350   
18331  0.600723  0.770413  0.438292  ...  0.589380  0.703374  0.666600   
18332  0.421757  0.699533  0.388636  ...  0.369236  0.517091  0.666600   

            AMW     WPath      WPol   Zagreb1   Zagreb2  mZagreb1  mZagreb2  
0      0.577790  0.595238  0.727273  0.310345  0.230769  0.445623  0.755556  
1      0.666350  0.607143  0.545455  0.344828  0.250000  0.462865  0.700000  
2      0.755160  0.523810  0.727273  0.482759  0.384615  0.293103  0.533333  
3      0.288895  0.404762  0.272727  0.275862  0.211538  0.415119  0.500000  
4      0.577540  0.369048  0.454545  0.896552  0.865385  0.320955  0.238889  
...         ...       ...       ...       ...       ...       ...       ...  
18328  0.755160  0.773810  0.545455  0.310345  0.230769  0.445623  0.755556  
18329  0.577540  0.547619  0.636364  0.517241  0.442308  0.409814  0.588889  
18330  0.666350  0.523810  0.727273  0.517241  0.442308  0.409814  0.533333  
18331  0.666600  0.535714  0.818182  0.517241  0.432692  0.409814  0.500000  
18332  0.666600  0.583333  0.636364  0.344828  0.259615  0.562334  0.688889  

[18333 rows x 1067 columns]
In [27]:
# Saving ml_df as a training dataset
ml_df.to_csv("ml_df.csv")
In [28]:
# Saving property csv for molecules that have atom counts equal to 19
df_19.to_csv('df_19.csv')