Different types of molecule representation:
- .xyz format: shows the atomic cartesian coordinates of each atom in a molecule
- (x,y,z) coordinate of each atom in a molecule
- .smi (SMILES) format: shows the the structure of a molecule in a 2D format
- rules:
- Atoms are represented by its atomic symbol
- If consist of two letters, the following is in lower case
- Lower case is also used for aromatic rings
- Single bonds are not represented
- Double bonds are represented with =
- Triple bonds are represented with #
- Hydrogens are implicit
- If notify one Hydrogen, it is the only hydrogen in the structure
- Branches are represented in parenthesis
- Rings are represented with numbers, of where the ring starts and close
- Atoms are represented by its atomic symbol
- rules:
- fingerprint
- columb matrix ...
Discriptors: numeric value that describes property of a molecule e.g. MW of 46.07, LogP of 0.5
- Imposes instrinsic meanings to the numbers
InĀ [3]:
# import libraries for data wrangling:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import MinMaxScaler
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim.lr_scheduler import ReduceLROnPlateau
from rdkit import Chem
from rdkit.Chem import Draw
from mordred import descriptors, Calculator
from molml.features import CoulombMatrix
InĀ [4]:
# Check number of XYZ files in directory
directory = "Data/dsgdb9nsd.xyz"
len(os.listdir(directory))
# Sample XYZ file for testing
sample_file = os.listdir(directory)[0] # gives out an abitrary file list, so ultimately is random
with open(os.path.join(directory, sample_file), 'r') as f:
content = f.readlines() # readlines() gives a list of the lines in XYZ file
for line in content:
print(line)
# SMILES representation of the molecule is in the line n + 4
smile_example = Chem.MolFromSmiles(content[17 + 3].split()[0]) # Since line starts from 0, we need to add 3 to the line number
smile_example
17 gdb 35176 3.69951 1.23093 1.09709 5.0509 78.3 -0.2676 0.0141 0.2818 1120.8417 0.137571 -380.797809 -380.79051 -380.789566 -380.829714 28.666 N -0.1158211823 -1.1588814083 0.0253875797 -0.33752 C -0.0634755918 -0.0038135736 0.037200671 0.335786 C 0.0110718662 1.4360892201 0.0060552095 -0.351511 N 0.5872156127 2.0980870138 1.2241547256 -0.40181 C 1.3066483466 2.1572458324 -0.0191184789 0.488704 C 2.7569832432 1.7502741759 -0.3116380498 -0.12094 C 2.4855003157 2.2287618766 -1.7724735543 -0.215041 C 1.8365633238 3.3230156624 -0.8677898469 -0.150471 C 3.1222090314 3.2335825367 0.0190305135 -0.223719 H -0.8402848187 1.9119567184 -0.4739629271 0.133233 H 0.0720428663 2.9567193044 1.4093287021 0.24383 H 3.2686458366 0.8463850226 0.0131731084 0.100189 H 3.3646646552 2.5453426122 -2.3375094804 0.102145 H 1.8106735614 1.6191840124 -2.3774773862 0.106847 H 1.2994311493 4.2188724372 -1.1813921281 0.081344 H 3.0295061775 3.5421096693 1.06234023 0.108438 H 4.0399128268 3.6114038773 -0.4376163482 0.100496 100.2007 127.9745 232.5875 298.4974 353.231 431.1215 522.3257 570.9286 621.3997 672.0319 784.2594 795.5337 865.9533 885.323 893.4322 904.9354 915.2204 981.0394 1016.9927 1019.7509 1034.4039 1063.3306 1083.6732 1090.5151 1118.5935 1175.6266 1189.894 1194.1127 1204.5343 1230.1567 1240.0228 1265.4029 1351.9632 1478.6117 1495.3028 1539.012 2358.3173 3063.1492 3064.9616 3110.5825 3132.5521 3134.8419 3141.495 3144.4329 3488.3389 N#CC1NC11C2CC1C2 N#C[C@H]1N[C@]21[C@H]1C[C@@H]2C1 InChI=1S/C7H8N2/c8-3-6-7(9-6)4-1-5(7)2-4/h4-6,9H,1-2H2 InChI=1S/C7H8N2/c8-3-6-7(9-6)4-1-5(7)2-4/h4-6,9H,1-2H2/t4-,5+,6-,7-/m1/s1
Out[4]:
InĀ [5]:
# Reading the entire dataset
def read_xyz(path):
"""
Reads the xyz files in the directory on 'path'
Input
path: the path to the folder to be read
Output
atoms: list with the characters representing the atoms of a molecule
coordinates: list with the cartesian coordinates of each atom
smile: list with the SMILE representation of a molecule
prop: list with the scalar properties
"""
atoms = []
coordinates = []
with open(path, 'r') as f:
lines = f.readlines()
num_atoms = int(lines[0])
smile = lines[num_atoms + 3].split()[0]
prop = lines[1].split()[2:] # The first two elements are the data set name and the number of index
for i in lines[2:num_atoms + 2]:
i = i.split()
atom = i[0]
atoms.append(atom) # Atoms are stored as a list of strings
try:
x = float(i[1])
y = float(i[2])
z = float(i[3])
coordinates.append((x, y, z)) # Coordinates are stored as a tuple of lists
except: # Some coordinates are include exponent notation
x = float(i[1].replace('*^', 'e'))
y = float(i[2].replace('*^', 'e'))
z = float(i[3].replace('*^', 'e'))
coordinates.append((x, y, z))
return atoms, coordinates, smile, prop
InĀ [6]:
# Read the entire dataset and return atoms, coordinates, smiles and properties
data = []
smiles = []
properties = []
for file in os.listdir(directory):
atoms, coordinates, smile, prop = read_xyz(os.path.join(directory, file))
data.append([atoms, coordinates])
smiles.append(smile)
properties.append(prop)
print ("Number of molecules in the dataset: ", len(data))
print ("Number of properties in the dataset: ", len(properties))
print ("Number of smiles in the dataset: ", len(smiles))
Number of molecules in the dataset: 133885 Number of properties in the dataset: 133885 Number of smiles in the dataset: 133885
InĀ [7]:
# Saving data list for use in other modules
import csv
with open('cm_eigen.csv', 'w', newline='') as f:
writer = csv.writer(f)
writer.writerows(data)
InĀ [8]:
# Properties and Data will be stored in a single dataframe
property_names = ['A', 'B', 'C', 'mu', 'alfa', 'homo', 'lumo', 'gap', 'R²', 'zpve', 'U0', 'U', 'H', 'G', 'Cv']
property_df = pd.DataFrame(properties, columns = property_names).astype("float32")
smiles_df = pd.DataFrame(smiles, columns = ["smiles"])
df = pd.concat([property_df,smiles_df],axis =1) # Using pd.concat() combines everything at once and avoids unnecessary memory copying
# Read each molecule by its SMILES string
valid_mols = []
for smi in smiles:
mol = Chem.MolFromSmiles(smi)
valid_mols.append(mol)
mol_df = pd.DataFrame(valid_mols, columns = ['mol'])
# Concat to main df and check if there are any Null values
df = pd.concat([df, mol_df], axis=1)
df['mol'].isnull().sum()
Out[8]:
0
InĀ [9]:
# Reads and prints the 20 first molecules
mol = df['mol'][:20]
Draw.MolsToGridImage(mol, molsPerRow=5, useSVG=True, legends=list(df['smiles'][:20].values))
# Draws multiple RDKit mol object and display in grid like formation
# useSVG enables vector graphics instead of pixel graphics, resulting in a better quality image
Out[9]:
InĀ [10]:
# SMILES representation does not contain hydrogen atoms
# Need to add hydrogen count to the RDKit mol object
df['mol'] = df['mol'].apply(lambda x: Chem.AddHs(x))
# Make a seperate column for number of atoms and number of heavy atoms
df['num_atoms'] = df['mol'].apply(lambda x: x.GetNumAtoms())
df['num_heavy_atoms'] = df['mol'].apply(lambda x: x.GetNumHeavyAtoms())
print(df['num_atoms'][0])
print(df['num_heavy_atoms'][0])
Draw.MolToImage(df['mol'][0]) # Draws a single RDKit mol object
17 9
Out[10]:
InĀ [11]:
# Creating canonical SMILE representation is essential for normalization of the representation
# Smiles representation to mol object and back to smiles representation
def canonicalize_smiles(smiles):
return Chem.MolToSmiles(Chem.MolFromSmiles(mol), isomericSmiles=True, canonical=True)
# isomericSmiles is used to keep the stereochemistry of the molecule; Chirality and double bonds
# canonical is used to keep the canonical SMILES representation
canonical_smile = []
for mol in smiles:
canonical_smile.append(canonicalize_smiles(mol))
df['canonical_smile'] = canonical_smile
InĀ [12]:
# Check for duplicates
df['canonical_smile'][df['canonical_smile'].duplicated()]
Out[12]:
17903 NC=Nc1cnccn1 24934 Cc1cnc(F)cn1 27909 CN(C)c1cnccn1 28007 O=[N+]([O-])c1cnccn1 30292 c1cnc(N2CC2)cn1 ... 131653 CC(C)c1cnccn1 132483 CNc1cnc(F)cn1 133438 C#Cc1cnc(O)cn1 133655 N#Cc1cnc(O)cn1 133850 C#Cc1cnc(N)cn1 Name: canonical_smile, Length: 87, dtype: object
InĀ [13]:
# Investigate the duplicates
df[df['canonical_smile'] == "Cc1cnc(F)cn1" ]
Out[13]:
A | B | C | mu | alfa | homo | lumo | gap | R² | zpve | U0 | U | H | G | Cv | smiles | mol | num_atoms | num_heavy_atoms | canonical_smile | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
8282 | 5.89515 | 1.52518 | 1.22084 | 1.7943 | 62.959999 | -0.2623 | -0.0557 | 0.2066 | 911.562073 | 0.096076 | -402.805389 | -402.798767 | -402.797821 | -402.836609 | 24.076 | CC1=CN=C(F)C=N1 | <rdkit.Chem.rdchem.Mol object at 0x36a717f40> | 13 | 8 | Cc1cnc(F)cn1 |
24934 | 5.89515 | 1.52518 | 1.22084 | 1.7943 | 62.959999 | -0.2623 | -0.0557 | 0.2066 | 911.563477 | 0.096076 | -402.805389 | -402.798767 | -402.797821 | -402.836609 | 24.076 | CC1=NC=C(F)N=C1 | <rdkit.Chem.rdchem.Mol object at 0x36f4e3530> | 13 | 8 | Cc1cnc(F)cn1 |
InĀ [14]:
# index 8282
Chem.MolFromSmiles("CC1=CN=C(F)C=N1")
Out[14]:
InĀ [15]:
# index = 24934
Chem.MolFromSmiles("CC1=NC=C(F)N=C1")
Out[15]:
InĀ [16]:
# Molecules are same, so will drop duplicates
df = df.drop_duplicates(subset='canonical_smile', keep='first')
# Reset the index
df = df.reset_index(drop=True)
# Check for duplicates again
df['canonical_smile'][df['canonical_smile'].duplicated()]
Out[16]:
Series([], Name: canonical_smile, dtype: object)
InĀ [17]:
# RDKit objects will be regenerated utilizing the canonical SMILES representation
df['mol'] = df['canonical_smile'].apply(lambda x: Chem.MolFromSmiles(x))
InĀ [18]:
# Property values will be investigated
# Check for outliers first
plt.figure(figsize=(15, 10))
plot_counter = 1
for prop in df.iloc[:,:15].columns:
plt.subplot(5, 3, plot_counter)
plt.hist(df.iloc[:,:15][prop], edgecolor='k')
plt.title(prop)
plot_counter += 1
plt.tight_layout()
plt.show()
InĀ [19]:
# Check statistics of the properties
df.describe()
Out[19]:
A | B | C | mu | alfa | homo | lumo | gap | R² | zpve | U0 | U | H | G | Cv | num_atoms | num_heavy_atoms | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 133798.000000 | 133798.000000 | 133798.000000 | 133798.000000 | 133798.000000 | 133798.000000 | 133798.000000 | 133798.000000 | 133798.000000 | 133798.000000 | 133798.000000 | 133798.000000 | 133798.000000 | 133798.000000 | 133798.000000 | 133798.000000 | 133798.000000 |
mean | 9.817750 | 1.406112 | 1.124983 | 2.706031 | 75.192825 | -0.239977 | 0.011164 | 0.251141 | 1189.569214 | 0.148550 | -411.537964 | -411.529449 | -411.528473 | -411.571320 | 31.603436 | 17.986069 | 8.795886 |
std | 1810.053833 | 1.584265 | 1.095953 | 1.530345 | 8.187228 | 0.022133 | 0.046923 | 0.047507 | 279.796112 | 0.033264 | 40.056843 | 40.056622 | 40.056622 | 40.057354 | 4.061663 | 2.953332 | 0.508952 |
min | 0.000000 | 0.337120 | 0.331180 | 0.000000 | 6.310000 | -0.428600 | -0.175000 | 0.024600 | 19.000200 | 0.015951 | -714.568054 | -714.560181 | -714.559204 | -714.602112 | 6.002000 | 3.000000 | 1.000000 |
25% | 2.554310 | 1.091790 | 0.910563 | 1.588800 | 70.379997 | -0.252500 | -0.023800 | 0.216300 | 1018.340546 | 0.125308 | -437.913811 | -437.905693 | -437.904755 | -437.947525 | 28.945000 | 16.000000 | 9.000000 |
50% | 3.089935 | 1.370030 | 1.078720 | 2.500050 | 75.500000 | -0.241000 | 0.012000 | 0.249500 | 1147.573181 | 0.148341 | -417.864075 | -417.856445 | -417.855499 | -417.895630 | 31.556999 | 18.000000 | 9.000000 |
75% | 3.834450 | 1.653957 | 1.279640 | 3.635900 | 80.519997 | -0.228700 | 0.049200 | 0.288200 | 1308.888245 | 0.171158 | -387.049057 | -387.039703 | -387.038757 | -387.083221 | 34.278000 | 20.000000 | 9.000000 |
max | 619867.687500 | 437.903870 | 282.945465 | 29.556400 | 196.619995 | -0.101700 | 0.193500 | 0.622100 | 3374.753174 | 0.273944 | -40.478931 | -40.476063 | -40.475117 | -40.498596 | 46.969002 | 29.000000 | 9.000000 |
InĀ [20]:
# For A, B and C; Max values are much larger than the mean indicating the presence of outliers
# Check for outliers in A, B and C
df[df['A']>100]
Out[20]:
A | B | C | mu | alfa | homo | lumo | gap | R² | zpve | U0 | U | H | G | Cv | smiles | mol | num_atoms | num_heavy_atoms | canonical_smile | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
18410 | 159.871170 | 0.379330 | 0.379330 | 1.9087 | 143.529999 | -0.2266 | -0.0676 | 0.1589 | 2708.153809 | 0.085798 | -345.074463 | -345.064758 | -345.063812 | -345.108856 | 33.347000 | CC#CC#CC#CC#C | <rdkit.Chem.rdchem.Mol object at 0x3b1e1d460> | 13 | 9 | C#CC#CC#CC#CC |
20529 | 159.619873 | 0.782570 | 0.782570 | 6.3203 | 82.739998 | -0.2665 | -0.0718 | 0.1947 | 1344.432739 | 0.065892 | -285.016357 | -285.009094 | -285.008148 | -285.047272 | 23.785999 | CC#CC#CC#N | <rdkit.Chem.rdchem.Mol object at 0x3b1e57b50> | 10 | 7 | CC#CC#CC#N |
21520 | 159.934586 | 0.783440 | 0.783440 | 1.5732 | 92.339996 | -0.2323 | -0.0452 | 0.1871 | 1373.624756 | 0.075867 | -268.917908 | -268.910156 | -268.909241 | -268.949005 | 26.267000 | CC#CC#CC#C | <rdkit.Chem.rdchem.Mol object at 0x3b1e730d0> | 11 | 7 | C#CC#CC#CC |
23641 | 159.655106 | 0.377100 | 0.377100 | 7.3662 | 130.860001 | -0.2540 | -0.0911 | 0.1628 | 2674.166504 | 0.076809 | -361.170929 | -361.161896 | -361.160950 | -361.205109 | 29.965000 | CC#CC#CC#CC#N | <rdkit.Chem.rdchem.Mol object at 0x3b1ead930> | 12 | 9 | CC#CC#CC#CC#N |
26441 | 619867.687500 | 1.334550 | 1.334540 | 0.0151 | 72.389999 | -0.2465 | -0.0565 | 0.1900 | 802.148499 | 0.046718 | -229.615448 | -229.609222 | -229.608276 | -229.639954 | 21.875000 | C#CC#CC#C | <rdkit.Chem.rdchem.Mol object at 0x3b1eface0> | 8 | 6 | C#CC#CC#C |
62592 | 127.834969 | 24.858721 | 23.978720 | 1.5258 | 16.969999 | -0.2653 | 0.0784 | 0.3437 | 83.793999 | 0.051208 | -115.679138 | -115.675819 | -115.674873 | -115.701874 | 8.751000 | CO | <rdkit.Chem.rdchem.Mol object at 0x3c1a93610> | 6 | 2 | CO |
62794 | 160.280411 | 8.593230 | 8.593210 | 0.7156 | 28.780001 | -0.2609 | 0.0613 | 0.3222 | 177.196304 | 0.055410 | -116.609550 | -116.605553 | -116.604607 | -116.633774 | 12.482000 | CC#C | <rdkit.Chem.rdchem.Mol object at 0x3c1a98f90> | 7 | 3 | C#CC |
71031 | 159.035675 | 9.223270 | 9.223240 | 3.8266 | 24.450001 | -0.3264 | 0.0376 | 0.3640 | 160.722305 | 0.045286 | -132.718155 | -132.714569 | -132.713623 | -132.742142 | 10.287000 | CC#N | <rdkit.Chem.rdchem.Mol object at 0x3c557c350> | 6 | 3 | CC#N |
79545 | 285.488403 | 38.982300 | 34.298920 | 2.1089 | 14.180000 | -0.2670 | -0.0406 | 0.2263 | 59.989101 | 0.026603 | -114.483612 | -114.480743 | -114.479805 | -114.505264 | 6.413000 | C=O | <rdkit.Chem.rdchem.Mol object at 0x3c8f670d0> | 4 | 2 | C=O |
87430 | 293.609741 | 293.541107 | 191.393967 | 1.6256 | 9.460000 | -0.2570 | 0.0829 | 0.3399 | 26.156300 | 0.034358 | -56.525887 | -56.523026 | -56.522083 | -56.544960 | 6.316000 | N | <rdkit.Chem.rdchem.Mol object at 0x3cca40970> | 4 | 1 | N |
88128 | 799.588135 | 437.903870 | 282.945465 | 1.8511 | 6.310000 | -0.2928 | 0.0687 | 0.3615 | 19.000200 | 0.021375 | -76.404701 | -76.401871 | -76.400925 | -76.422348 | 6.002000 | O | <rdkit.Chem.rdchem.Mol object at 0x3cca53d10> | 3 | 1 | O |
95570 | 157.711807 | 157.709976 | 157.706985 | 0.0000 | 13.210000 | -0.3877 | 0.1171 | 0.5048 | 35.364101 | 0.044749 | -40.478931 | -40.476063 | -40.475117 | -40.498596 | 6.469000 | C | <rdkit.Chem.rdchem.Mol object at 0x3d0621230> | 5 | 1 | C |
103389 | 232663.781250 | 0.570310 | 0.570310 | 0.0096 | 119.419998 | -0.2383 | -0.0774 | 0.1609 | 1770.345459 | 0.058203 | -305.770203 | -305.762421 | -305.761475 | -305.797943 | 27.635000 | C#CC#CC#CC#C | <rdkit.Chem.rdchem.Mol object at 0x3d06f8d60> | 10 | 8 | C#CC#CC#CC#C |
117751 | 160.026535 | 2.048960 | 2.048960 | 1.1881 | 54.540001 | -0.2420 | -0.0085 | 0.2335 | 576.593628 | 0.065688 | -192.762451 | -192.756653 | -192.755707 | -192.790146 | 19.382000 | CC#CC#C | <rdkit.Chem.rdchem.Mol object at 0x3d7b85070> | 9 | 5 | C#CC#CC |
126160 | 159.561462 | 2.076230 | 2.076230 | 5.1545 | 47.820000 | -0.2871 | -0.0377 | 0.2494 | 553.209412 | 0.055566 | -208.863266 | -208.857880 | -208.856934 | -208.890808 | 17.054001 | CC#CC#N | <rdkit.Chem.rdchem.Mol object at 0x3db66d000> | 8 | 5 | CC#CC#N |
InĀ [21]:
# Will be dropping the index 26441 and 103389 due to extensive outliers
df = df.drop([26441, 103389])
# Reset the index
df = df.reset_index(drop=True)
# Outliers in B and C wasn't as extensive as A so will not be dropping them
df[df['B']>100]
df[df['C']>100]
Out[21]:
A | B | C | mu | alfa | homo | lumo | gap | R² | zpve | U0 | U | H | G | Cv | smiles | mol | num_atoms | num_heavy_atoms | canonical_smile | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
87429 | 293.609741 | 293.541107 | 191.393967 | 1.6256 | 9.46 | -0.2570 | 0.0829 | 0.3399 | 26.156300 | 0.034358 | -56.525887 | -56.523026 | -56.522083 | -56.544960 | 6.316 | N | <rdkit.Chem.rdchem.Mol object at 0x3cca40970> | 4 | 1 | N |
88127 | 799.588135 | 437.903870 | 282.945465 | 1.8511 | 6.31 | -0.2928 | 0.0687 | 0.3615 | 19.000200 | 0.021375 | -76.404701 | -76.401871 | -76.400925 | -76.422348 | 6.002 | O | <rdkit.Chem.rdchem.Mol object at 0x3cca53d10> | 3 | 1 | O |
95569 | 157.711807 | 157.709976 | 157.706985 | 0.0000 | 13.21 | -0.3877 | 0.1171 | 0.5048 | 35.364101 | 0.044749 | -40.478931 | -40.476063 | -40.475117 | -40.498596 | 6.469 | C | <rdkit.Chem.rdchem.Mol object at 0x3d0621230> | 5 | 1 | C |
InĀ [22]:
# Check plots again
plt.figure(figsize=(15, 10))
plot_counter = 1
for prop in df.iloc[:,:15].columns:
plt.subplot(5, 3, plot_counter)
plt.hist(df.iloc[:,:15][prop], edgecolor='k')
plt.title(prop)
plot_counter += 1
plt.tight_layout()
plt.show()
# A outliers were reduced
InĀ [23]:
# Save dataframe to csv
df.to_csv('processed_data.csv', index=False)