# Import library
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


# Load the two PDB CSV exports: the sequence table and the
# structure-characteristics table.
df_seq, df_char = (
    pd.read_csv('data/pdb_data_seq.csv'),
    pd.read_csv('data/pdb_data_no_dups.csv'),
)

# Inspect the sequence table: schema, missing values per column, and the
# distribution of macromolecule types (NaN counted as its own category).
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df_seq.info()
print()
print(df_seq.isnull().sum())
print()
print(df_seq['macromoleculeType'].value_counts(dropna=False))

# Keep only rows whose macromolecule type is exactly 'Protein'. Mixed entries
# such as 'Protein#RNA' and rows with a missing type do not match and are
# therefore excluded.
protein_seq = df_seq[df_seq['macromoleculeType'] == 'Protein']
print(protein_seq)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 467304 entries, 0 to 467303
Data columns (total 5 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   structureId        467304 non-null  object
 1   chainId            467294 non-null  object
 2   sequence           467276 non-null  object
 3   residueCount       467304 non-null  int64 
 4   macromoleculeType  432487 non-null  object
dtypes: int64(1), object(4)
memory usage: 17.8+ MB
None

structureId              0
chainId                 10
sequence                28
residueCount             0
macromoleculeType    34817
dtype: int64

macromoleculeType
Protein                       345180
Protein#RNA                    56226
NaN                            34817
Protein#DNA                    21303
DNA                             3784
Protein#DNA#RNA                 2712
RNA                             2389
Protein#RNA#DNA/RNA Hybrid       304
Protein#DNA#DNA/RNA Hybrid       159
DNA/RNA Hybrid                   141
DNA#RNA                          121
RNA#DNA/RNA Hybrid                74
Protein#DNA/RNA Hybrid            68
DNA#DNA/RNA Hybrid                26
Name: count, dtype: int64
       structureId chainId                                           sequence  \
4             101M       A  MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDR...   
7             102L       A  MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAAKSE...   
8             102M       A  MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDR...   
11            103L       A  MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNSLDAAK...   
12            103M       A  MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDR...   
...            ...     ...                                                ...   
467299        9XIA       A  MNYQPTPEDRFTFGLWTVGWQGRDPFGDATRRALDPVESVQRLAEL...   
467300        9XIM       A  SVQATREDKFSFGLWTVGWQARDAFGDATRTALDPVEAVHKLAEIG...   
467301        9XIM       B  SVQATREDKFSFGLWTVGWQARDAFGDATRTALDPVEAVHKLAEIG...   
467302        9XIM       C  SVQATREDKFSFGLWTVGWQARDAFGDATRTALDPVEAVHKLAEIG...   
467303        9XIM       D  SVQATREDKFSFGLWTVGWQARDAFGDATRTALDPVEAVHKLAEIG...   

        residueCount macromoleculeType  
4                154           Protein  
7                165           Protein  
8                154           Protein  
11               167           Protein  
12               154           Protein  
...              ...               ...  
467299           388           Protein  
467300          1572           Protein  
467301          1572           Protein  
467302          1572           Protein  
467303          1572           Protein  

[345180 rows x 5 columns]

# Inspect the structure-characteristics table: schema, missing values per
# column, and the distribution of macromolecule types (NaN counted as its own
# category).
# DataFrame.info() prints its report itself and returns None, so it is called
# directly rather than through print(), which would add a stray "None" line.
df_char.info()
print()
print(df_char.isnull().sum())
print()
print(df_char['macromoleculeType'].value_counts(dropna=False))

# Keep only rows whose macromolecule type is exactly 'Protein'; mixed types
# (e.g. 'Protein#DNA') and rows with a missing type are excluded.
protein_char = df_char[df_char['macromoleculeType'] == 'Protein']
print(protein_char)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 141401 entries, 0 to 141400
Data columns (total 14 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   structureId               141401 non-null  object 
 1   classification            141399 non-null  object 
 2   experimentalTechnique     141401 non-null  object 
 3   macromoleculeType         137636 non-null  object 
 4   residueCount              141401 non-null  int64  
 5   resolution                128589 non-null  float64
 6   structureMolecularWeight  141401 non-null  float64
 7   crystallizationMethod     96242 non-null   object 
 8   crystallizationTempK      97039 non-null   float64
 9   densityMatthews           124724 non-null  float64
 10  densityPercentSol         124749 non-null  float64
 11  pdbxDetails               118534 non-null  object 
 12  phValue                   105110 non-null  float64
 13  publicationYear           117602 non-null  float64
dtypes: float64(7), int64(1), object(6)
memory usage: 15.1+ MB
None

structureId                     0
classification                  2
experimentalTechnique           0
macromoleculeType            3765
residueCount                    0
resolution                  12812
structureMolecularWeight        0
crystallizationMethod       45159
crystallizationTempK        44362
densityMatthews             16677
densityPercentSol           16652
pdbxDetails                 22867
phValue                     36291
publicationYear             23799
dtype: int64

macromoleculeType
Protein                       127798
Protein#DNA                     4176
NaN                             3765
Protein#RNA                     2162
DNA                             1744
RNA                             1295
Protein#DNA#RNA                  250
DNA/RNA Hybrid                    58
DNA#RNA                           51
Protein#DNA#DNA/RNA Hybrid        34
RNA#DNA/RNA Hybrid                27
Protein#DNA/RNA Hybrid            19
DNA#DNA/RNA Hybrid                13
Protein#RNA#DNA/RNA Hybrid         9
Name: count, dtype: int64
       structureId                            classification  \
2             101M                          OXYGEN TRANSPORT   
4             102L                     HYDROLASE(O-GLYCOSYL)   
5             102M                          OXYGEN TRANSPORT   
7             103L                     HYDROLASE(O-GLYCOSYL)   
8             103M                          OXYGEN TRANSPORT   
...            ...                                       ...   
141395        9RSA            HYDROLASE (PHOSPHORIC DIESTER)   
141396        9RUB                      LYASE(CARBON-CARBON)   
141398        9WGA                       LECTIN (AGGLUTININ)   
141399        9XIA  ISOMERASE(INTRAMOLECULAR OXIDOREDUCTASE)   
141400        9XIM  ISOMERASE(INTRAMOLECULAR OXIDOREDUCTASE)   

       experimentalTechnique macromoleculeType  residueCount  resolution  \
2          X-RAY DIFFRACTION           Protein           154        2.07   
4          X-RAY DIFFRACTION           Protein           165        1.74   
5          X-RAY DIFFRACTION           Protein           154        1.84   
7          X-RAY DIFFRACTION           Protein           167        1.90   
8          X-RAY DIFFRACTION           Protein           154        2.07   
...                      ...               ...           ...         ...   
141395     X-RAY DIFFRACTION           Protein           248        1.80   
141396     X-RAY DIFFRACTION           Protein           932        2.60   
141398     X-RAY DIFFRACTION           Protein           342        1.80   
141399     X-RAY DIFFRACTION           Protein           388        1.90   
141400     X-RAY DIFFRACTION           Protein          1572        2.40   

        structureMolecularWeight crystallizationMethod  crystallizationTempK  \
2                       18112.80                   NaN                   NaN   
4                       18926.61                   NaN                   NaN   
5                       18010.64                   NaN                   NaN   
7                       19092.72                   NaN                   NaN   
8                       18093.78                   NaN                   NaN   
...                          ...                   ...                   ...   
141395                  27987.16                   NaN                   NaN   
141396                 101838.68                   NaN                   NaN   
141398                  34270.22                   NaN                   NaN   
141399                  43542.29                   NaN                   NaN   
141400                 174722.12                   NaN                   NaN   

        densityMatthews  densityPercentSol  \
2                  3.09              60.20   
4                  2.75              55.28   
5                  3.09              60.20   
7                  2.70              54.46   
8                  3.09              60.30   
...                 ...                ...   
141395             2.25              45.45   
141396             2.38              48.29   
141398             2.50              50.76   
141399             2.79              55.93   
141400             3.96              68.92   

                                              pdbxDetails  phValue  \
2       3.0 M AMMONIUM SULFATE, 20 MM TRIS, 1MM EDTA, ...      9.0   
4                                                     NaN      NaN   
5       3.0 M AMMONIUM SULFATE, 20 MM TRIS, 1MM EDTA, ...      9.0   
7                                                     NaN      NaN   
8       3.0 M AMMONIUM SULFATE, 20 MM TRIS, 1MM EDTA, ...      9.0   
...                                                   ...      ...   
141395                                                NaN      NaN   
141396                                                NaN      NaN   
141398                                                NaN      NaN   
141399                                                NaN      NaN   
141400                                                NaN      NaN   

        publicationYear  
2                1999.0  
4                1993.0  
5                1999.0  
7                1993.0  
8                1999.0  
...                 ...  
141395           1990.0  
141396           1991.0  
141398           1990.0  
141399           1989.0  
141400           1992.0  

[127798 rows x 14 columns]

# Inner join the two protein tables on structureId. Both inputs also carry
# macromoleculeType and residueCount, so pandas suffixes those columns with
# _x (from protein_seq) and _y (from protein_char) in the result.
joint_df = pd.merge(protein_seq, protein_char, on='structureId', how='inner')

# Examine the joined frame for missing values. info() prints its own report
# and returns None, so it is not wrapped in print().
joint_df.info()

# Drop every row that has a missing value in ANY column, then renumber the
# index from 0.
# NOTE(review): this also discards rows that are only missing fields such as
# crystallizationMethod, pdbxDetails or publicationYear; consider
# dropna(subset=[...]) if those columns are not needed downstream.
joint_df = joint_df.dropna().reset_index(drop=True)

joint_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 346325 entries, 0 to 346324
Data columns (total 18 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   structureId               346325 non-null  object 
 1   chainId                   346323 non-null  object 
 2   sequence                  346322 non-null  object 
 3   residueCount_x            346325 non-null  int64  
 4   macromoleculeType_x       346325 non-null  object 
 5   classification            346324 non-null  object 
 6   experimentalTechnique     346325 non-null  object 
 7   macromoleculeType_y       346325 non-null  object 
 8   residueCount_y            346325 non-null  int64  
 9   resolution                330232 non-null  float64
 10  structureMolecularWeight  346325 non-null  float64
 11  crystallizationMethod     240597 non-null  object 
 12  crystallizationTempK      243934 non-null  float64
 13  densityMatthews           307481 non-null  float64
 14  densityPercentSol         307615 non-null  float64
 15  pdbxDetails               294697 non-null  object 
 16  phValue                   259130 non-null  float64
 17  publicationYear           295363 non-null  float64
dtypes: float64(7), int64(2), object(9)
memory usage: 47.6+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 170154 entries, 0 to 170153
Data columns (total 18 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   structureId               170154 non-null  object 
 1   chainId                   170154 non-null  object 
 2   sequence                  170154 non-null  object 
 3   residueCount_x            170154 non-null  int64  
 4   macromoleculeType_x       170154 non-null  object 
 5   classification            170154 non-null  object 
 6   experimentalTechnique     170154 non-null  object 
 7   macromoleculeType_y       170154 non-null  object 
 8   residueCount_y            170154 non-null  int64  
 9   resolution                170154 non-null  float64
 10  structureMolecularWeight  170154 non-null  float64
 11  crystallizationMethod     170154 non-null  object 
 12  crystallizationTempK      170154 non-null  float64
 13  densityMatthews           170154 non-null  float64
 14  densityPercentSol         170154 non-null  float64
 15  pdbxDetails               170154 non-null  object 
 16  phValue                   170154 non-null  float64
 17  publicationYear           170154 non-null  float64
dtypes: float64(7), int64(2), object(9)
memory usage: 23.4+ MB
None

# Show the cleaned, joined frame followed by its column index
print(joint_df, joint_df.columns, sep='\n')

       structureId chainId                                           sequence  \
0             1914       A  MASMTGGQQMGRIPGNSPRMVLLESEQFLTELTRLFQKCRSSGSVF...   
1             1A04       A  SNQEPATILLIDDHPMLRTGVKQLISMAPDITVVGEASNGEQGIEL...   
2             1A04       B  SNQEPATILLIDDHPMLRTGVKQLISMAPDITVVGEASNGEQGIEL...   
3             1A0R       B  XSELDQLRQEAEQLKNQIRDARKACADATLSQITNNIDPVGRIQMR...   
4             1A0R       G  PVINIEDLTEKDKLKMEVDQLKKEVTLERMLVSKCCEEFRDYVEER...   
...            ...     ...                                                ...   
170149        6FAH       B  MRILVCAKQVPDTNEVKIDPKTGTMIREGVPSILNPDDANALEAAL...   
170150        6FAH       C  MYFSEQNKMIRKLARDFAEKELTTEILDEVEESGEFPQEILDKMAK...   
170151        6FAH       D  MYFSEQNKMIRKLARDFAEKELTTEILDEVEESGEFPQEILDKMAK...   
170152        6FAH       E  MAIKVIEEKCIGCSKCQKSCPFDAITIENKIAVIGDACTNCGTCID...   
170153        6FAH       F  MRILVCAKQVPDTNEVKIDPKTGTMIREGVPSILNPDDANALEAAL...   

        residueCount_x macromoleculeType_x                     classification  \
0                  232             Protein                         ALU DOMAIN   
1                  430             Protein        SIGNAL TRANSDUCTION PROTEIN   
2                  430             Protein        SIGNAL TRANSDUCTION PROTEIN   
3                  650             Protein  COMPLEX (TRANSDUCER/TRANSDUCTION)   
4                  650             Protein  COMPLEX (TRANSDUCER/TRANSDUCTION)   
...                ...                 ...                                ...   
170149            2074             Protein                       FLAVOPROTEIN   
170150            2074             Protein                       FLAVOPROTEIN   
170151            2074             Protein                       FLAVOPROTEIN   
170152            2074             Protein                       FLAVOPROTEIN   
170153            2074             Protein                       FLAVOPROTEIN   

       experimentalTechnique macromoleculeType_y  residueCount_y  resolution  \
0          X-RAY DIFFRACTION             Protein             232        2.53   
1          X-RAY DIFFRACTION             Protein             430        2.20   
2          X-RAY DIFFRACTION             Protein             430        2.20   
3          X-RAY DIFFRACTION             Protein             650        2.80   
4          X-RAY DIFFRACTION             Protein             650        2.80   
...                      ...                 ...             ...         ...   
170149     X-RAY DIFFRACTION             Protein            2074        3.13   
170150     X-RAY DIFFRACTION             Protein            2074        3.13   
170151     X-RAY DIFFRACTION             Protein            2074        3.13   
170152     X-RAY DIFFRACTION             Protein            2074        3.13   
170153     X-RAY DIFFRACTION             Protein            2074        3.13   

        structureMolecularWeight          crystallizationMethod  \
0                       26562.73                   hanging drop   
1                       47657.25  VAPOR DIFFUSION, SITTING DROP   
2                       47657.25  VAPOR DIFFUSION, SITTING DROP   
3                       73467.70                     MICROBATCH   
4                       73467.70                     MICROBATCH   
...                          ...                            ...   
170149                 231360.91  VAPOR DIFFUSION, SITTING DROP   
170150                 231360.91  VAPOR DIFFUSION, SITTING DROP   
170151                 231360.91  VAPOR DIFFUSION, SITTING DROP   
170152                 231360.91  VAPOR DIFFUSION, SITTING DROP   
170153                 231360.91  VAPOR DIFFUSION, SITTING DROP   

        crystallizationTempK  densityMatthews  densityPercentSol  \
0                     277.00             3.00              34.00   
1                     277.00             2.49              51.03   
2                     277.00             2.49              51.03   
3                     277.00             2.25              45.00   
4                     277.00             2.25              45.00   
...                      ...              ...                ...   
170149                293.15             3.49              64.73   
170150                293.15             3.49              64.73   
170151                293.15             3.49              64.73   
170152                293.15             3.49              64.73   
170153                293.15             3.49              64.73   

                                              pdbxDetails  phValue  \
0       THE SRPPHI14-9 PROTEIN WAS CRYSTALLIZED (BIRSE...      7.7   
1       THE PROTEIN SOLUTION CONTAINING 23.3 MG/ML OF ...      7.6   
2       THE PROTEIN SOLUTION CONTAINING 23.3 MG/ML OF ...      7.6   
3       THE PROTEIN COMPLEX (10 MG/ML SOLUTION) WAS CR...      6.8   
4       THE PROTEIN COMPLEX (10 MG/ML SOLUTION) WAS CR...      6.8   
...                                                   ...      ...   
170149               PEG 4000, Tris/HCl, Lithium sulphate      8.5   
170150               PEG 4000, Tris/HCl, Lithium sulphate      8.5   
170151               PEG 4000, Tris/HCl, Lithium sulphate      8.5   
170152               PEG 4000, Tris/HCl, Lithium sulphate      8.5   
170153               PEG 4000, Tris/HCl, Lithium sulphate      8.5   

        publicationYear  
0                1997.0  
1                1998.0  
2                1998.0  
3                1998.0  
4                1998.0  
...                 ...  
170149           2018.0  
170150           2018.0  
170151           2018.0  
170152           2018.0  
170153           2018.0  

[170154 rows x 18 columns]
Index(['structureId', 'chainId', 'sequence', 'residueCount_x',
       'macromoleculeType_x', 'classification', 'experimentalTechnique',
       'macromoleculeType_y', 'residueCount_y', 'resolution',
       'structureMolecularWeight', 'crystallizationMethod',
       'crystallizationTempK', 'densityMatthews', 'densityPercentSol',
       'pdbxDetails', 'phValue', 'publicationYear'],
      dtype='object')

# Compare each pair of suffixed columns row by row and tally how many rows
# agree (True) or disagree (False)
types_match = joint_df['macromoleculeType_x'].eq(joint_df['macromoleculeType_y'])
counts_match = joint_df['residueCount_x'].eq(joint_df['residueCount_y'])
print(types_match.value_counts())
print(counts_match.value_counts())

True    170154
Name: count, dtype: int64
True    170154
Name: count, dtype: int64

# The _x and _y copies matched on every row, so keep a single copy of each
# under the unsuffixed name and discard the other.
# pop() removes the _x column and returns it; assigning it under the new name
# appends it as the last column of the frame.
joint_df['macromoleculeType'] = joint_df.pop('macromoleculeType_x')
del joint_df['macromoleculeType_y']

joint_df['residueCount'] = joint_df.pop('residueCount_x')  # _y would do equally well
del joint_df['residueCount_y']

# Check the shape of joint_df again after collapsing the duplicated columns.
# Printed explicitly: a bare expression is only echoed by an interactive
# cell and does nothing when this file is run as a script.
print(joint_df.shape)

(170154, 16)

# The protein class is the key column here, so inspect how its values are
# distributed. value_counts() sorts by descending frequency by default.
class_count = joint_df['classification'].value_counts()
print(class_count)

# Reuse the counts computed above instead of recounting the column; the first
# 50 entries are the 50 most frequent classes.
class_count.head(50).plot.bar(figsize=(12, 6))
plt.title("Top 50 Protein Classifications")
plt.xlabel("Classification")
plt.ylabel("Count")
# Slant the long class names and right-align them under their bars
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

classification
HYDROLASE                         22963
TRANSFERASE                       18159
OXIDOREDUCTASE                    17007
IMMUNE SYSTEM                      9679
HYDROLASE/HYDROLASE INHIBITOR      8966
                                  ...  
Transferase, Signaling protein        1
HYDROLASE, ANTITUMOR PROTEIN          1
Ligase, Transferase                   1
ANTIVIRAL PROTEIN, HYDROLASE          1
lipid transport/activator             1
Name: count, Length: 2172, dtype: int64

# The class distribution is very wide, so keep only the classes that occur
# more than 1000 times in joint_df.
class_filter = class_count[class_count > 1000].index
print(class_filter)
# Printed explicitly so the number of retained classes is also shown when the
# file is run as a script rather than as an interactive cell.
print(len(class_filter))

Index(['HYDROLASE', 'TRANSFERASE', 'OXIDOREDUCTASE', 'IMMUNE SYSTEM',
       'HYDROLASE/HYDROLASE INHIBITOR', 'LYASE', 'TRANSCRIPTION',
       'TRANSPORT PROTEIN', 'VIRAL PROTEIN', 'ISOMERASE', 'SIGNALING PROTEIN',
       'LIGASE', 'PROTEIN BINDING', 'TRANSFERASE/TRANSFERASE INHIBITOR',
       'MEMBRANE PROTEIN', 'SUGAR BINDING PROTEIN', 'STRUCTURAL PROTEIN',
       'CHAPERONE', 'DNA BINDING PROTEIN', 'METAL BINDING PROTEIN',
       'CELL ADHESION', 'ELECTRON TRANSPORT', 'PROTEIN TRANSPORT',
       'UNKNOWN FUNCTION', 'TOXIN', 'CELL CYCLE', 'GENE REGULATION',
       'PHOTOSYNTHESIS', 'RNA BINDING PROTEIN'],
      dtype='object', name='classification')

29

print(joint_df.shape)

# Restrict the joined frame to rows whose class is in class_filter
is_frequent_class = joint_df['classification'].isin(class_filter)
filtered_1000 = joint_df.loc[is_frequent_class].reset_index(drop=True)
print(filtered_1000.shape)

# Collapse repeated sequences: only the first row seen for each distinct
# sequence string is kept, whatever the other columns contain
unique_sequences = filtered_1000.drop_duplicates(subset='sequence', keep='first')
filtered_1000_dropped = unique_sequences.reset_index(drop=True)
print(filtered_1000_dropped.shape)

(170154, 16)
(131401, 16)
(33771, 16)

# Examine the de-duplicated frame, then keep only the columns that will be
# used for machine learning.
# info() prints its own report and returns None, so it is not wrapped in
# print() (that would add a stray "None" line).
filtered_1000_dropped.info()
print(filtered_1000_dropped.head())

columns = ["classification", "structureId", "sequence", "resolution", "structureMolecularWeight",
           "crystallizationTempK", "densityMatthews", "densityPercentSol", "phValue", "residueCount"]
# Take an explicit copy so later modifications of final_data neither trigger
# SettingWithCopyWarning nor depend on filtered_1000_dropped.
final_data = filtered_1000_dropped[columns].copy()

# Examine final data
final_data.info()
print(final_data.head())

# Final data to csv.
# NOTE(review): to_csv keeps the default index=True, so the RangeIndex is
# written as an unnamed first column; pass index=False if readers of
# final_data.csv do not expect that column.
final_data.to_csv('final_data.csv')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33771 entries, 0 to 33770
Data columns (total 16 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   structureId               33771 non-null  object 
 1   chainId                   33771 non-null  object 
 2   sequence                  33771 non-null  object 
 3   classification            33771 non-null  object 
 4   experimentalTechnique     33771 non-null  object 
 5   resolution                33771 non-null  float64
 6   structureMolecularWeight  33771 non-null  float64
 7   crystallizationMethod     33771 non-null  object 
 8   crystallizationTempK      33771 non-null  float64
 9   densityMatthews           33771 non-null  float64
 10  densityPercentSol         33771 non-null  float64
 11  pdbxDetails               33771 non-null  object 
 12  phValue                   33771 non-null  float64
 13  publicationYear           33771 non-null  float64
 14  macromoleculeType         33771 non-null  object 
 15  residueCount              33771 non-null  int64  
dtypes: float64(7), int64(1), object(8)
memory usage: 4.1+ MB
None
  structureId chainId                                           sequence  \
0        1A72       A  STAGKVIKCKAAVLWEEKKPFSIEEVEVAPPKAHEVRIKMVATGIC...   
1        1A8O       A  MDIRQGPKEPFRDYVDRFYKTLRAEQASQEVKNWMTETLLVQNANP...   
2        1AR4       A  AVYTLPELPYDYSALEPYISGEIMELHHDKHHKAYVDGANTALDKL...   
3        1AUE       A  ELIRVAILWHEMWHEGLEEASRLYFGERNVKGMFEVLEPLHAMMER...   
4        1AUK       A  RPPNIVLIFADDLGYGDLGCYGHPSSTTPNLDQLAAGGLRFTDFYV...   

   classification experimentalTechnique  resolution  structureMolecularWeight  \
0  OXIDOREDUCTASE     X-RAY DIFFRACTION        2.60                  40658.50   
1   VIRAL PROTEIN     X-RAY DIFFRACTION        1.70                   8175.72   
2  OXIDOREDUCTASE     X-RAY DIFFRACTION        1.90                  45428.53   
3     TRANSFERASE     X-RAY DIFFRACTION        2.33                  24203.73   
4       HYDROLASE     X-RAY DIFFRACTION        2.10                  52423.45   

           crystallizationMethod  crystallizationTempK  densityMatthews  \
0  VAPOR DIFFUSION, HANGING DROP                 277.0             2.30   
1  VAPOR DIFFUSION, SITTING DROP                 277.0             2.21   
2                   microseeding                 277.0             2.05   
3  VAPOR DIFFUSION, HANGING DROP                 277.0             2.25   
4  VAPOR DIFFUSION, HANGING DROP                 291.0             3.30   

   densityPercentSol                                        pdbxDetails  \
0              46.82  CRYSTALS GROWN FROM 4 MICROLITER HANGING DROPS...   
1              43.80  CRYSTALS OF CA(151-231) WERE GROWN AT 4C IN 4 ...   
2              32.00  PROTEIN WAS CRYSTALLIZED AT 50MG/ML FROM 2.15 ...   
3              45.00  HANGING DROPS AT 4 C, pH 8.0, vapor diffusion ...   
4              63.00  PROTEIN WAS CRYSTALLIZED BY VAPOR DIFFUSION IN...   

   phValue  publicationYear macromoleculeType  residueCount  
0      8.4           1998.0           Protein           374  
1      8.0           1997.0           Protein            70  
2      6.1           1996.0           Protein           402  
3      8.0           1997.0           Protein           200  
4      5.4           1998.0           Protein           489  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33771 entries, 0 to 33770
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   classification            33771 non-null  object 
 1   structureId               33771 non-null  object 
 2   sequence                  33771 non-null  object 
 3   resolution                33771 non-null  float64
 4   structureMolecularWeight  33771 non-null  float64
 5   crystallizationTempK      33771 non-null  float64
 6   densityMatthews           33771 non-null  float64
 7   densityPercentSol         33771 non-null  float64
 8   phValue                   33771 non-null  float64
 9   residueCount              33771 non-null  int64  
dtypes: float64(6), int64(1), object(3)
memory usage: 2.6+ MB
None
   classification structureId  \
0  OXIDOREDUCTASE        1A72   
1   VIRAL PROTEIN        1A8O   
2  OXIDOREDUCTASE        1AR4   
3     TRANSFERASE        1AUE   
4       HYDROLASE        1AUK   

                                            sequence  resolution  \
0  STAGKVIKCKAAVLWEEKKPFSIEEVEVAPPKAHEVRIKMVATGIC...        2.60   
1  MDIRQGPKEPFRDYVDRFYKTLRAEQASQEVKNWMTETLLVQNANP...        1.70   
2  AVYTLPELPYDYSALEPYISGEIMELHHDKHHKAYVDGANTALDKL...        1.90   
3  ELIRVAILWHEMWHEGLEEASRLYFGERNVKGMFEVLEPLHAMMER...        2.33   
4  RPPNIVLIFADDLGYGDLGCYGHPSSTTPNLDQLAAGGLRFTDFYV...        2.10   

   structureMolecularWeight  crystallizationTempK  densityMatthews  \
0                  40658.50                 277.0             2.30   
1                   8175.72                 277.0             2.21   
2                  45428.53                 277.0             2.05   
3                  24203.73                 277.0             2.25   
4                  52423.45                 291.0             3.30   

   densityPercentSol  phValue  residueCount  
0              46.82      8.4           374  
1              43.80      8.0            70  
2              32.00      6.1           402  
3              45.00      8.0           200  
4              63.00      5.4           489