In [127]:
# Import library
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# Import Datasets
df_seq = pd.read_csv('data/pdb_data_seq.csv')
df_char = pd.read_csv('data/pdb_data_no_dups.csv')
In [128]:
# Look through df_seq DataFrame
print(df_seq.info())
print()
print(df_seq.isnull().sum())
print()
print(df_seq['macromoleculeType'].value_counts(dropna=False))
# Filter for protein sequences only
protein_seq = df_seq[df_seq['macromoleculeType'] == 'Protein']
print(protein_seq)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 467304 entries, 0 to 467303 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 structureId 467304 non-null object 1 chainId 467294 non-null object 2 sequence 467276 non-null object 3 residueCount 467304 non-null int64 4 macromoleculeType 432487 non-null object dtypes: int64(1), object(4) memory usage: 17.8+ MB None structureId 0 chainId 10 sequence 28 residueCount 0 macromoleculeType 34817 dtype: int64 macromoleculeType Protein 345180 Protein#RNA 56226 NaN 34817 Protein#DNA 21303 DNA 3784 Protein#DNA#RNA 2712 RNA 2389 Protein#RNA#DNA/RNA Hybrid 304 Protein#DNA#DNA/RNA Hybrid 159 DNA/RNA Hybrid 141 DNA#RNA 121 RNA#DNA/RNA Hybrid 74 Protein#DNA/RNA Hybrid 68 DNA#DNA/RNA Hybrid 26 Name: count, dtype: int64 structureId chainId sequence \ 4 101M A MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDR... 7 102L A MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAAKSE... 8 102M A MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDR... 11 103L A MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNSLDAAK... 12 103M A MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDR... ... ... ... ... 467299 9XIA A MNYQPTPEDRFTFGLWTVGWQGRDPFGDATRRALDPVESVQRLAEL... 467300 9XIM A SVQATREDKFSFGLWTVGWQARDAFGDATRTALDPVEAVHKLAEIG... 467301 9XIM B SVQATREDKFSFGLWTVGWQARDAFGDATRTALDPVEAVHKLAEIG... 467302 9XIM C SVQATREDKFSFGLWTVGWQARDAFGDATRTALDPVEAVHKLAEIG... 467303 9XIM D SVQATREDKFSFGLWTVGWQARDAFGDATRTALDPVEAVHKLAEIG... residueCount macromoleculeType 4 154 Protein 7 165 Protein 8 154 Protein 11 167 Protein 12 154 Protein ... ... ... 467299 388 Protein 467300 1572 Protein 467301 1572 Protein 467302 1572 Protein 467303 1572 Protein [345180 rows x 5 columns]
In [129]:
# Examine df_char DataFrame
print(df_char.info())
print()
print(df_char.isnull().sum())
print()
print(df_char['macromoleculeType'].value_counts(dropna = False))
# Filter to macromolecule type = protein
protein_char = df_char[df_char['macromoleculeType'] == 'Protein']
print(protein_char)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 141401 entries, 0 to 141400 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 structureId 141401 non-null object 1 classification 141399 non-null object 2 experimentalTechnique 141401 non-null object 3 macromoleculeType 137636 non-null object 4 residueCount 141401 non-null int64 5 resolution 128589 non-null float64 6 structureMolecularWeight 141401 non-null float64 7 crystallizationMethod 96242 non-null object 8 crystallizationTempK 97039 non-null float64 9 densityMatthews 124724 non-null float64 10 densityPercentSol 124749 non-null float64 11 pdbxDetails 118534 non-null object 12 phValue 105110 non-null float64 13 publicationYear 117602 non-null float64 dtypes: float64(7), int64(1), object(6) memory usage: 15.1+ MB None structureId 0 classification 2 experimentalTechnique 0 macromoleculeType 3765 residueCount 0 resolution 12812 structureMolecularWeight 0 crystallizationMethod 45159 crystallizationTempK 44362 densityMatthews 16677 densityPercentSol 16652 pdbxDetails 22867 phValue 36291 publicationYear 23799 dtype: int64 macromoleculeType Protein 127798 Protein#DNA 4176 NaN 3765 Protein#RNA 2162 DNA 1744 RNA 1295 Protein#DNA#RNA 250 DNA/RNA Hybrid 58 DNA#RNA 51 Protein#DNA#DNA/RNA Hybrid 34 RNA#DNA/RNA Hybrid 27 Protein#DNA/RNA Hybrid 19 DNA#DNA/RNA Hybrid 13 Protein#RNA#DNA/RNA Hybrid 9 Name: count, dtype: int64 structureId classification \ 2 101M OXYGEN TRANSPORT 4 102L HYDROLASE(O-GLYCOSYL) 5 102M OXYGEN TRANSPORT 7 103L HYDROLASE(O-GLYCOSYL) 8 103M OXYGEN TRANSPORT ... ... ... 141395 9RSA HYDROLASE (PHOSPHORIC DIESTER) 141396 9RUB LYASE(CARBON-CARBON) 141398 9WGA LECTIN (AGGLUTININ) 141399 9XIA ISOMERASE(INTRAMOLECULAR OXIDOREDUCTASE) 141400 9XIM ISOMERASE(INTRAMOLECULAR OXIDOREDUCTASE) experimentalTechnique macromoleculeType residueCount resolution \ 2 X-RAY DIFFRACTION Protein 154 2.07 4 X-RAY DIFFRACTION Protein 165 1.74 5 X-RAY DIFFRACTION Protein 154 1.84 7 X-RAY DIFFRACTION Protein 167 1.90 8 X-RAY DIFFRACTION Protein 154 2.07 ... ... ... ... ... 141395 X-RAY DIFFRACTION Protein 248 1.80 141396 X-RAY DIFFRACTION Protein 932 2.60 141398 X-RAY DIFFRACTION Protein 342 1.80 141399 X-RAY DIFFRACTION Protein 388 1.90 141400 X-RAY DIFFRACTION Protein 1572 2.40 structureMolecularWeight crystallizationMethod crystallizationTempK \ 2 18112.80 NaN NaN 4 18926.61 NaN NaN 5 18010.64 NaN NaN 7 19092.72 NaN NaN 8 18093.78 NaN NaN ... ... ... ... 141395 27987.16 NaN NaN 141396 101838.68 NaN NaN 141398 34270.22 NaN NaN 141399 43542.29 NaN NaN 141400 174722.12 NaN NaN densityMatthews densityPercentSol \ 2 3.09 60.20 4 2.75 55.28 5 3.09 60.20 7 2.70 54.46 8 3.09 60.30 ... ... ... 141395 2.25 45.45 141396 2.38 48.29 141398 2.50 50.76 141399 2.79 55.93 141400 3.96 68.92 pdbxDetails phValue \ 2 3.0 M AMMONIUM SULFATE, 20 MM TRIS, 1MM EDTA, ... 9.0 4 NaN NaN 5 3.0 M AMMONIUM SULFATE, 20 MM TRIS, 1MM EDTA, ... 9.0 7 NaN NaN 8 3.0 M AMMONIUM SULFATE, 20 MM TRIS, 1MM EDTA, ... 9.0 ... ... ... 141395 NaN NaN 141396 NaN NaN 141398 NaN NaN 141399 NaN NaN 141400 NaN NaN publicationYear 2 1999.0 4 1993.0 5 1999.0 7 1993.0 8 1999.0 ... ... 141395 1990.0 141396 1991.0 141398 1990.0 141399 1989.0 141400 1992.0 [127798 rows x 14 columns]
In [130]:
# Inner join two DataFrames by structureID
joint_df = pd.merge(protein_seq, protein_char, on = 'structureId')
# Examine joined df for missing values
print(joint_df.info())
# Drop missing values
joint_df = joint_df.dropna().reset_index(drop = True)
print(joint_df.info())
<class 'pandas.core.frame.DataFrame'> RangeIndex: 346325 entries, 0 to 346324 Data columns (total 18 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 structureId 346325 non-null object 1 chainId 346323 non-null object 2 sequence 346322 non-null object 3 residueCount_x 346325 non-null int64 4 macromoleculeType_x 346325 non-null object 5 classification 346324 non-null object 6 experimentalTechnique 346325 non-null object 7 macromoleculeType_y 346325 non-null object 8 residueCount_y 346325 non-null int64 9 resolution 330232 non-null float64 10 structureMolecularWeight 346325 non-null float64 11 crystallizationMethod 240597 non-null object 12 crystallizationTempK 243934 non-null float64 13 densityMatthews 307481 non-null float64 14 densityPercentSol 307615 non-null float64 15 pdbxDetails 294697 non-null object 16 phValue 259130 non-null float64 17 publicationYear 295363 non-null float64 dtypes: float64(7), int64(2), object(9) memory usage: 47.6+ MB None <class 'pandas.core.frame.DataFrame'> RangeIndex: 170154 entries, 0 to 170153 Data columns (total 18 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 structureId 170154 non-null object 1 chainId 170154 non-null object 2 sequence 170154 non-null object 3 residueCount_x 170154 non-null int64 4 macromoleculeType_x 170154 non-null object 5 classification 170154 non-null object 6 experimentalTechnique 170154 non-null object 7 macromoleculeType_y 170154 non-null object 8 residueCount_y 170154 non-null int64 9 resolution 170154 non-null float64 10 structureMolecularWeight 170154 non-null float64 11 crystallizationMethod 170154 non-null object 12 crystallizationTempK 170154 non-null float64 13 densityMatthews 170154 non-null float64 14 densityPercentSol 170154 non-null float64 15 pdbxDetails 170154 non-null object 16 phValue 170154 non-null float64 17 publicationYear 170154 non-null float64 dtypes: float64(7), int64(2), object(9) memory usage: 23.4+ MB None
In [131]:
# Overview of joined dataframe
print(joint_df)
print(joint_df.columns)
structureId chainId sequence \ 0 1914 A MASMTGGQQMGRIPGNSPRMVLLESEQFLTELTRLFQKCRSSGSVF... 1 1A04 A SNQEPATILLIDDHPMLRTGVKQLISMAPDITVVGEASNGEQGIEL... 2 1A04 B SNQEPATILLIDDHPMLRTGVKQLISMAPDITVVGEASNGEQGIEL... 3 1A0R B XSELDQLRQEAEQLKNQIRDARKACADATLSQITNNIDPVGRIQMR... 4 1A0R G PVINIEDLTEKDKLKMEVDQLKKEVTLERMLVSKCCEEFRDYVEER... ... ... ... ... 170149 6FAH B MRILVCAKQVPDTNEVKIDPKTGTMIREGVPSILNPDDANALEAAL... 170150 6FAH C MYFSEQNKMIRKLARDFAEKELTTEILDEVEESGEFPQEILDKMAK... 170151 6FAH D MYFSEQNKMIRKLARDFAEKELTTEILDEVEESGEFPQEILDKMAK... 170152 6FAH E MAIKVIEEKCIGCSKCQKSCPFDAITIENKIAVIGDACTNCGTCID... 170153 6FAH F MRILVCAKQVPDTNEVKIDPKTGTMIREGVPSILNPDDANALEAAL... residueCount_x macromoleculeType_x classification \ 0 232 Protein ALU DOMAIN 1 430 Protein SIGNAL TRANSDUCTION PROTEIN 2 430 Protein SIGNAL TRANSDUCTION PROTEIN 3 650 Protein COMPLEX (TRANSDUCER/TRANSDUCTION) 4 650 Protein COMPLEX (TRANSDUCER/TRANSDUCTION) ... ... ... ... 170149 2074 Protein FLAVOPROTEIN 170150 2074 Protein FLAVOPROTEIN 170151 2074 Protein FLAVOPROTEIN 170152 2074 Protein FLAVOPROTEIN 170153 2074 Protein FLAVOPROTEIN experimentalTechnique macromoleculeType_y residueCount_y resolution \ 0 X-RAY DIFFRACTION Protein 232 2.53 1 X-RAY DIFFRACTION Protein 430 2.20 2 X-RAY DIFFRACTION Protein 430 2.20 3 X-RAY DIFFRACTION Protein 650 2.80 4 X-RAY DIFFRACTION Protein 650 2.80 ... ... ... ... ... 170149 X-RAY DIFFRACTION Protein 2074 3.13 170150 X-RAY DIFFRACTION Protein 2074 3.13 170151 X-RAY DIFFRACTION Protein 2074 3.13 170152 X-RAY DIFFRACTION Protein 2074 3.13 170153 X-RAY DIFFRACTION Protein 2074 3.13 structureMolecularWeight crystallizationMethod \ 0 26562.73 hanging drop 1 47657.25 VAPOR DIFFUSION, SITTING DROP 2 47657.25 VAPOR DIFFUSION, SITTING DROP 3 73467.70 MICROBATCH 4 73467.70 MICROBATCH ... ... ... 170149 231360.91 VAPOR DIFFUSION, SITTING DROP 170150 231360.91 VAPOR DIFFUSION, SITTING DROP 170151 231360.91 VAPOR DIFFUSION, SITTING DROP 170152 231360.91 VAPOR DIFFUSION, SITTING DROP 170153 231360.91 VAPOR DIFFUSION, SITTING DROP crystallizationTempK densityMatthews densityPercentSol \ 0 277.00 3.00 34.00 1 277.00 2.49 51.03 2 277.00 2.49 51.03 3 277.00 2.25 45.00 4 277.00 2.25 45.00 ... ... ... ... 170149 293.15 3.49 64.73 170150 293.15 3.49 64.73 170151 293.15 3.49 64.73 170152 293.15 3.49 64.73 170153 293.15 3.49 64.73 pdbxDetails phValue \ 0 THE SRPPHI14-9 PROTEIN WAS CRYSTALLIZED (BIRSE... 7.7 1 THE PROTEIN SOLUTION CONTAINING 23.3 MG/ML OF ... 7.6 2 THE PROTEIN SOLUTION CONTAINING 23.3 MG/ML OF ... 7.6 3 THE PROTEIN COMPLEX (10 MG/ML SOLUTION) WAS CR... 6.8 4 THE PROTEIN COMPLEX (10 MG/ML SOLUTION) WAS CR... 6.8 ... ... ... 170149 PEG 4000, Tris/HCl, Lithium sulphate 8.5 170150 PEG 4000, Tris/HCl, Lithium sulphate 8.5 170151 PEG 4000, Tris/HCl, Lithium sulphate 8.5 170152 PEG 4000, Tris/HCl, Lithium sulphate 8.5 170153 PEG 4000, Tris/HCl, Lithium sulphate 8.5 publicationYear 0 1997.0 1 1998.0 2 1998.0 3 1998.0 4 1998.0 ... ... 170149 2018.0 170150 2018.0 170151 2018.0 170152 2018.0 170153 2018.0 [170154 rows x 18 columns] Index(['structureId', 'chainId', 'sequence', 'residueCount_x', 'macromoleculeType_x', 'classification', 'experimentalTechnique', 'macromoleculeType_y', 'residueCount_y', 'resolution', 'structureMolecularWeight', 'crystallizationMethod', 'crystallizationTempK', 'densityMatthews', 'densityPercentSol', 'pdbxDetails', 'phValue', 'publicationYear'], dtype='object')
In [132]:
# Check if the overlapping column contents are equal
print((joint_df['macromoleculeType_x'] == joint_df['macromoleculeType_y']).value_counts())
print((joint_df['residueCount_x'] == joint_df['residueCount_y']).value_counts())
True 170154 Name: count, dtype: int64 True 170154 Name: count, dtype: int64
In [133]:
# Both columns give True for all 170154 rows, therefore will be dropping one and renaming
joint_df['macromoleculeType'] = joint_df['macromoleculeType_x']
joint_df.drop(columns=['macromoleculeType_x', 'macromoleculeType_y'], inplace=True)
joint_df['residueCount'] = joint_df['residueCount_x'] # or _y
joint_df.drop(columns=['residueCount_x', 'residueCount_y'], inplace=True)
In [134]:
# Checking shape of joint_df again
joint_df.shape
Out[134]:
(170154, 16)
In [135]:
# Class of the protein is essential and distribution is to be checked
class_count = joint_df['classification'].value_counts(ascending= False)
print(class_count)
joint_df['classification'].value_counts().head(50).plot.bar(figsize=(12, 6))
plt.title("Top 50 Protein Classifications")
plt.xlabel("Classification")
plt.ylabel("Count")
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()
classification HYDROLASE 22963 TRANSFERASE 18159 OXIDOREDUCTASE 17007 IMMUNE SYSTEM 9679 HYDROLASE/HYDROLASE INHIBITOR 8966 ... Transferase, Signaling protein 1 HYDROLASE, ANTITUMOR PROTEIN 1 Ligase, Transferase 1 ANTIVIRAL PROTEIN, HYDROLASE 1 lipid transport/activator 1 Name: count, Length: 2172, dtype: int64
In [136]:
# Wide distribtuion of protein class, therefore will be filter class for counts over 1000
class_filter = class_count[class_count > 1000].index
print(class_filter)
len(class_filter)
Index(['HYDROLASE', 'TRANSFERASE', 'OXIDOREDUCTASE', 'IMMUNE SYSTEM', 'HYDROLASE/HYDROLASE INHIBITOR', 'LYASE', 'TRANSCRIPTION', 'TRANSPORT PROTEIN', 'VIRAL PROTEIN', 'ISOMERASE', 'SIGNALING PROTEIN', 'LIGASE', 'PROTEIN BINDING', 'TRANSFERASE/TRANSFERASE INHIBITOR', 'MEMBRANE PROTEIN', 'SUGAR BINDING PROTEIN', 'STRUCTURAL PROTEIN', 'CHAPERONE', 'DNA BINDING PROTEIN', 'METAL BINDING PROTEIN', 'CELL ADHESION', 'ELECTRON TRANSPORT', 'PROTEIN TRANSPORT', 'UNKNOWN FUNCTION', 'TOXIN', 'CELL CYCLE', 'GENE REGULATION', 'PHOTOSYNTHESIS', 'RNA BINDING PROTEIN'], dtype='object', name='classification')
Out[136]:
29
In [137]:
print(joint_df.shape)
# Filter joined dataframe from the classes with count number over 1000
filtered_1000 = joint_df[joint_df['classification'].isin(class_filter)].reset_index(drop=True)
print(filtered_1000.shape)
# Drop duplicates specifically on the subset
filtered_1000_dropped = filtered_1000.drop_duplicates(subset=["sequence"]).reset_index(drop=True)
print(filtered_1000_dropped.shape)
(170154, 16) (131401, 16) (33771, 16)
In [138]:
# Examining dataframe and filtering for columns that will be used for machine learning
print(filtered_1000_dropped.info())
print(filtered_1000_dropped.head())
columns = ["classification", "structureId", "sequence", "resolution", "structureMolecularWeight",
"crystallizationTempK", "densityMatthews", "densityPercentSol", "phValue", "residueCount"]
final_data = filtered_1000_dropped[columns]
# Examine final data
print(final_data.info())
print(final_data.head())
# Final data to csv
final_data.to_csv('final_data.csv')
<class 'pandas.core.frame.DataFrame'> RangeIndex: 33771 entries, 0 to 33770 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 structureId 33771 non-null object 1 chainId 33771 non-null object 2 sequence 33771 non-null object 3 classification 33771 non-null object 4 experimentalTechnique 33771 non-null object 5 resolution 33771 non-null float64 6 structureMolecularWeight 33771 non-null float64 7 crystallizationMethod 33771 non-null object 8 crystallizationTempK 33771 non-null float64 9 densityMatthews 33771 non-null float64 10 densityPercentSol 33771 non-null float64 11 pdbxDetails 33771 non-null object 12 phValue 33771 non-null float64 13 publicationYear 33771 non-null float64 14 macromoleculeType 33771 non-null object 15 residueCount 33771 non-null int64 dtypes: float64(7), int64(1), object(8) memory usage: 4.1+ MB None structureId chainId sequence \ 0 1A72 A STAGKVIKCKAAVLWEEKKPFSIEEVEVAPPKAHEVRIKMVATGIC... 1 1A8O A MDIRQGPKEPFRDYVDRFYKTLRAEQASQEVKNWMTETLLVQNANP... 2 1AR4 A AVYTLPELPYDYSALEPYISGEIMELHHDKHHKAYVDGANTALDKL... 3 1AUE A ELIRVAILWHEMWHEGLEEASRLYFGERNVKGMFEVLEPLHAMMER... 4 1AUK A RPPNIVLIFADDLGYGDLGCYGHPSSTTPNLDQLAAGGLRFTDFYV... classification experimentalTechnique resolution structureMolecularWeight \ 0 OXIDOREDUCTASE X-RAY DIFFRACTION 2.60 40658.50 1 VIRAL PROTEIN X-RAY DIFFRACTION 1.70 8175.72 2 OXIDOREDUCTASE X-RAY DIFFRACTION 1.90 45428.53 3 TRANSFERASE X-RAY DIFFRACTION 2.33 24203.73 4 HYDROLASE X-RAY DIFFRACTION 2.10 52423.45 crystallizationMethod crystallizationTempK densityMatthews \ 0 VAPOR DIFFUSION, HANGING DROP 277.0 2.30 1 VAPOR DIFFUSION, SITTING DROP 277.0 2.21 2 microseeding 277.0 2.05 3 VAPOR DIFFUSION, HANGING DROP 277.0 2.25 4 VAPOR DIFFUSION, HANGING DROP 291.0 3.30 densityPercentSol pdbxDetails \ 0 46.82 CRYSTALS GROWN FROM 4 MICROLITER HANGING DROPS... 1 43.80 CRYSTALS OF CA(151-231) WERE GROWN AT 4C IN 4 ... 2 32.00 PROTEIN WAS CRYSTALLIZED AT 50MG/ML FROM 2.15 ... 3 45.00 HANGING DROPS AT 4 C, pH 8.0, vapor diffusion ... 4 63.00 PROTEIN WAS CRYSTALLIZED BY VAPOR DIFFUSION IN... phValue publicationYear macromoleculeType residueCount 0 8.4 1998.0 Protein 374 1 8.0 1997.0 Protein 70 2 6.1 1996.0 Protein 402 3 8.0 1997.0 Protein 200 4 5.4 1998.0 Protein 489 <class 'pandas.core.frame.DataFrame'> RangeIndex: 33771 entries, 0 to 33770 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 classification 33771 non-null object 1 structureId 33771 non-null object 2 sequence 33771 non-null object 3 resolution 33771 non-null float64 4 structureMolecularWeight 33771 non-null float64 5 crystallizationTempK 33771 non-null float64 6 densityMatthews 33771 non-null float64 7 densityPercentSol 33771 non-null float64 8 phValue 33771 non-null float64 9 residueCount 33771 non-null int64 dtypes: float64(6), int64(1), object(3) memory usage: 2.6+ MB None classification structureId \ 0 OXIDOREDUCTASE 1A72 1 VIRAL PROTEIN 1A8O 2 OXIDOREDUCTASE 1AR4 3 TRANSFERASE 1AUE 4 HYDROLASE 1AUK sequence resolution \ 0 STAGKVIKCKAAVLWEEKKPFSIEEVEVAPPKAHEVRIKMVATGIC... 2.60 1 MDIRQGPKEPFRDYVDRFYKTLRAEQASQEVKNWMTETLLVQNANP... 1.70 2 AVYTLPELPYDYSALEPYISGEIMELHHDKHHKAYVDGANTALDKL... 1.90 3 ELIRVAILWHEMWHEGLEEASRLYFGERNVKGMFEVLEPLHAMMER... 2.33 4 RPPNIVLIFADDLGYGDLGCYGHPSSTTPNLDQLAAGGLRFTDFYV... 2.10 structureMolecularWeight crystallizationTempK densityMatthews \ 0 40658.50 277.0 2.30 1 8175.72 277.0 2.21 2 45428.53 277.0 2.05 3 24203.73 277.0 2.25 4 52423.45 291.0 3.30 densityPercentSol phValue residueCount 0 46.82 8.4 374 1 43.80 8.0 70 2 32.00 6.1 402 3 45.00 8.0 200 4 63.00 5.4 489