# Import library
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Read the prepared dataset; the first CSV column is used as the row index
final_data = pd.read_csv('final_data.csv', index_col=0)

# Print summary statistics of final_data
summary_stats = final_data.describe()
print(summary_stats)

         resolution  structureMolecularWeight  crystallizationTempK  \
count  33771.000000              3.377100e+04          33771.000000   
mean       2.219473              1.007227e+05            291.201611   
std        0.596595              1.539324e+05              8.961340   
min        0.680000              8.368100e+02              4.000000   
25%        1.800000              3.492054e+04            291.000000   
50%        2.100000              5.915556e+04            293.000000   
75%        2.570000              1.067060e+05            295.000000   
max        9.010000              4.921404e+06            398.000000   

       densityMatthews  densityPercentSol       phValue  residueCount  
count     33771.000000       33771.000000  33771.000000  33771.000000  
mean          2.740118          52.439039      6.796882    889.246809  
std           0.761843          10.185204      1.368236   1356.177297  
min           0.000000           0.000000      0.000000      7.000000  
25%           2.250000          45.300000      6.000000    307.000000  
50%           2.540000          51.630000      7.000000    522.000000  
75%           3.000000          58.940000      7.500000    945.000000  
max          12.700000          90.330000    100.000000  57792.000000

# Draw an annotated heatmap of the pairwise correlations between the
# numeric columns (values shown with two decimals)
plt.figure(figsize=(5, 5))
correlations = final_data.corr(numeric_only=True)
sns.heatmap(correlations, annot=True, fmt='.2f')
plt.title('Correlation Matrix')
# Slant the x tick labels so the column names stay readable
plt.xticks(rotation=45, ha='right')

plt.show()

# Look for outliers: one 40-bin histogram per numeric column,
# each drawn in its own 5x5 figure
numeric_columns = final_data.select_dtypes(include='number').columns
for name in numeric_columns:
    plt.figure(figsize=(5, 5))
    sns.histplot(data=final_data, x=name, bins=40)
    plt.title(str(name))
    plt.tight_layout()
    plt.show()

# Show the extremes of every numeric column by printing its values
# sorted from highest to lowest
numeric_columns = final_data.select_dtypes(include='number').columns
for name in numeric_columns:
    # One-element set literal: the header is printed as {'<column name>'}
    print({name})
    ranked = final_data[name].sort_values(ascending=False)
    print(ranked)
    # Blank line between columns
    print()

{'resolution'}
28911    9.01
10855    9.00
20108    8.49
20109    8.49
24817    8.00
         ... 
20163    0.78
26364    0.74
28097    0.70
6744     0.69
8011     0.68
Name: resolution, Length: 33771, dtype: float64

{'structureMolecularWeight'}
33742    4921404.50
15439    4918807.00
10855    4652705.00
27786    3762007.50
27783    3762007.50
            ...    
29432       3292.05
3892        2441.58
27243       2138.67
30397       2120.47
6739         836.81
Name: structureMolecularWeight, Length: 33771, dtype: float64

{'crystallizationTempK'}
3659    398.0
4096    335.0
2271    334.0
9305    333.0
9306    333.0
        ...  
6056     20.0
930       4.0
484       4.0
659       4.0
4506      4.0
Name: crystallizationTempK, Length: 33771, dtype: float64

{'densityMatthews'}
31467    12.70
21518    11.12
28911    10.30
6809     10.00
6810     10.00
         ...  
5430      1.00
222       1.00
69        1.00
1823      1.00
28235     0.00
Name: densityMatthews, Length: 33771, dtype: float64

{'densityPercentSol'}
31467    90.33
21518    88.94
28911    88.00
17440    87.63
11631    87.28
         ...  
6243      0.54
5750      0.54
5755      0.52
5756      0.52
28235     0.00
Name: densityPercentSol, Length: 33771, dtype: float64

{'phValue'}
26267    100.0
6802      11.0
27414     11.0
6800      11.0
17426     11.0
         ...  
14419      0.0
16803      0.0
14418      0.0
16801      0.0
29777      0.0
Name: phValue, Length: 33771, dtype: float64

{'residueCount'}
15439    57792
33742    44480
27778    33372
27785    33372
27784    33372
         ...  
26013       27
3892        20
30397       20
27243       16
6739         7
Name: residueCount, Length: 33771, dtype: int64

# Drop rows whose pH is above 14 or equal to 0.
# Both comparisons are parenthesised on purpose: `&` binds tighter than the
# comparison operators, so `(ph <= 14) & ph != 0` would be parsed as
# `((ph <= 14) & ph) != 0`, i.e. a bitwise AND between a boolean mask and the
# raw float column rather than the intersection of two conditions.
ph_values = final_data['phValue']
final_data = final_data[(ph_values <= 14) & (ph_values != 0)]

# Remove the row found at integer position 29256 of the current index
# NOTE(review): final_data.index[29256] is a positional lookup, so the label
# removed is whatever occupies that position at this point in the script,
# which is not necessarily the row labelled 29256 -- confirm the intended row.
row_label = final_data.index[29256]
final_data = final_data.drop(index=row_label)

# Examine the dataframe again after cleaning.
# The summary is printed explicitly: a bare `final_data.describe()` expression
# discards its result when this file runs as a script (only a notebook would
# display it, and only as the last expression of a cell).
print(final_data.describe())

# Re-plot the distributions: one 40-bin histogram per numeric column,
# each drawn in its own 3x3 figure
numeric_columns = final_data.select_dtypes(include='number').columns
for name in numeric_columns:
    plt.figure(figsize=(3, 3))
    sns.histplot(data=final_data, x=name, bins=40)
    plt.title(str(name))
    plt.tight_layout()
    plt.show()

# Write the dataframe to ml_data.csv for the ML/DL stage
final_data.to_csv(path_or_buf='ml_data.csv')

	resolution	structureMolecularWeight	crystallizationTempK	densityMatthews	densityPercentSol	phValue	residueCount
count	33763.000000	3.376300e+04	33763.000000	33763.000000	33763.000000	33763.000000	33763.000000
mean	2.219488	1.007273e+05	291.201037	2.740131	52.438934	6.795279	889.287593
std	0.596651	1.539484e+05	8.962288	0.761902	10.185734	1.267643	1356.316126
min	0.680000	8.368100e+02	4.000000	0.000000	0.000000	2.000000	7.000000
25%	1.800000	3.491871e+04	291.000000	2.250000	45.300000	6.000000	307.000000
50%	2.100000	5.915089e+04	293.000000	2.540000	51.630000	7.000000	522.000000
75%	2.570000	1.067060e+05	295.000000	3.000000	58.940000	7.500000	945.000000
max	9.010000	4.921404e+06	398.000000	12.700000	90.330000	11.000000	57792.000000