InĀ [10]:
# Import library
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Read the prepared dataset; the CSV's first column supplies the row labels.
final_data = pd.read_csv(
    'final_data.csv',
    index_col=0,
)
InĀ [11]:
# Summary statistics (count, mean, std, quartiles, extremes) of final_data
summary_stats = final_data.describe()
print(summary_stats)
         resolution  structureMolecularWeight  crystallizationTempK  \
count  33771.000000              3.377100e+04          33771.000000   
mean       2.219473              1.007227e+05            291.201611   
std        0.596595              1.539324e+05              8.961340   
min        0.680000              8.368100e+02              4.000000   
25%        1.800000              3.492054e+04            291.000000   
50%        2.100000              5.915556e+04            293.000000   
75%        2.570000              1.067060e+05            295.000000   
max        9.010000              4.921404e+06            398.000000   

       densityMatthews  densityPercentSol       phValue  residueCount  
count     33771.000000       33771.000000  33771.000000  33771.000000  
mean          2.740118          52.439039      6.796882    889.246809  
std           0.761843          10.185204      1.368236   1356.177297  
min           0.000000           0.000000      0.000000      7.000000  
25%           2.250000          45.300000      6.000000    307.000000  
50%           2.540000          51.630000      7.000000    522.000000  
75%           3.000000          58.940000      7.500000    945.000000  
max          12.700000          90.330000    100.000000  57792.000000  
InĀ [12]:
# Pairwise correlation of the numeric columns, drawn as an annotated heatmap
corr_matrix = final_data.corr(numeric_only=True)

fig, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', ax=ax)
ax.set_title('Correlation Matrix')
# Slant the x tick labels so the long column names stay readable
plt.xticks(rotation=45, ha='right')

plt.show()
No description has been provided for this image

structureMolecularWeight and residueCount appear to be highly correlated, so either one can be used on its own (keeping both adds little extra information).

InĀ [13]:
# One 40-bin histogram per numeric feature, to spot outliers by eye
numeric_columns = final_data.select_dtypes(include='number').columns
for feature in numeric_columns:
    fig, ax = plt.subplots(figsize=(5, 5))
    sns.histplot(data=final_data, x=feature, bins=40, ax=ax)
    ax.set_title(f'{feature}')
    fig.tight_layout()
    plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
InĀ [14]:
# Show the highest and lowest values of every numeric column.
# Each column is printed sorted in descending order; pandas abbreviates the
# long Series, so only the largest and smallest entries are displayed.
for column in final_data.select_dtypes(include='number').columns:
    # Print the plain column name. The previous `print({column})` built a
    # one-element set and printed its repr, e.g. {'resolution'}.
    print(column)
    print(final_data[column].sort_values(ascending=False))
    print()
{'resolution'}
28911    9.01
10855    9.00
20108    8.49
20109    8.49
24817    8.00
         ... 
20163    0.78
26364    0.74
28097    0.70
6744     0.69
8011     0.68
Name: resolution, Length: 33771, dtype: float64

{'structureMolecularWeight'}
33742    4921404.50
15439    4918807.00
10855    4652705.00
27786    3762007.50
27783    3762007.50
            ...    
29432       3292.05
3892        2441.58
27243       2138.67
30397       2120.47
6739         836.81
Name: structureMolecularWeight, Length: 33771, dtype: float64

{'crystallizationTempK'}
3659    398.0
4096    335.0
2271    334.0
9305    333.0
9306    333.0
        ...  
6056     20.0
930       4.0
484       4.0
659       4.0
4506      4.0
Name: crystallizationTempK, Length: 33771, dtype: float64

{'densityMatthews'}
31467    12.70
21518    11.12
28911    10.30
6809     10.00
6810     10.00
         ...  
5430      1.00
222       1.00
69        1.00
1823      1.00
28235     0.00
Name: densityMatthews, Length: 33771, dtype: float64

{'densityPercentSol'}
31467    90.33
21518    88.94
28911    88.00
17440    87.63
11631    87.28
         ...  
6243      0.54
5750      0.54
5755      0.52
5756      0.52
28235     0.00
Name: densityPercentSol, Length: 33771, dtype: float64

{'phValue'}
26267    100.0
6802      11.0
27414     11.0
6800      11.0
17426     11.0
         ...  
14419      0.0
16803      0.0
14418      0.0
16801      0.0
29777      0.0
Name: phValue, Length: 33771, dtype: float64

{'residueCount'}
15439    57792
33742    44480
27778    33372
27785    33372
27784    33372
         ...  
26013       27
3892        20
30397       20
27243       16
6739         7
Name: residueCount, Length: 33771, dtype: int64

InĀ [15]:
# Drop rows whose pH is above 14 or exactly 0.
# Each comparison needs its own parentheses: `&` binds tighter than `!=`, so
# the unparenthesised form was evaluated as `((ph <= 14) & ph) != 0` and only
# produced the intended rows because pandas coerced the float column to bool.
ph = final_data['phValue']
valid_ph = (ph <= 14) & (ph != 0)
final_data = final_data[valid_ph]

# Drop one more row: the one whose label sits at *position* 29256 of the
# already-filtered frame.
# NOTE(review): `final_data.index[29256]` is a positional lookup, so after the
# pH filter the removed label is not necessarily 29256 -- confirm which row
# was meant. Use `final_data.drop(index=29256)` if the label was intended.
# NOTE(review): this cell reassigns `final_data` from itself, so re-running it
# removes an additional row each time.
final_data = final_data.drop(final_data.index[29256])

# Examine dataframe again
final_data.describe()
Out[15]:
resolution structureMolecularWeight crystallizationTempK densityMatthews densityPercentSol phValue residueCount
count 33763.000000 3.376300e+04 33763.000000 33763.000000 33763.000000 33763.000000 33763.000000
mean 2.219488 1.007273e+05 291.201037 2.740131 52.438934 6.795279 889.287593
std 0.596651 1.539484e+05 8.962288 0.761902 10.185734 1.267643 1356.316126
min 0.680000 8.368100e+02 4.000000 0.000000 0.000000 2.000000 7.000000
25% 1.800000 3.491871e+04 291.000000 2.250000 45.300000 6.000000 307.000000
50% 2.100000 5.915089e+04 293.000000 2.540000 51.630000 7.000000 522.000000
75% 2.570000 1.067060e+05 295.000000 3.000000 58.940000 7.500000 945.000000
max 9.010000 4.921404e+06 398.000000 12.700000 90.330000 11.000000 57792.000000
InĀ [16]:
# Re-draw the per-feature histograms (smaller panels) on the current final_data
for col_name in final_data.select_dtypes(include='number'):
    fig, ax = plt.subplots(figsize=(3, 3))
    sns.histplot(final_data, x=col_name, bins=40, ax=ax)
    ax.set_title(f'{col_name}')
    fig.tight_layout()
    plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
InĀ [17]:
# Persist the current final_data (row labels included) for the ML/DL step
final_data.to_csv('ml_data.csv', index=True)