In [55]:
### Loading the Titanic dataset
# This script loads the Titanic dataset from CSV files into pandas dataframes.
import pandas as pd
# Load the training and testing data into dataframes
train_data = pd.read_csv('titanic_data/train.csv')
train_df = pd.DataFrame(train_data)
test_data = pd.read_csv('titanic_data/test.csv')
test_df = pd.DataFrame(test_data)
y_data = pd.read_csv('titanic_data/gender_submission.csv')
y_df = pd.DataFrame(y_data)
# Merge the testing data into one dataframe
test_df = pd.merge(test_df, y_df, on='PassengerId', how='left')
In [56]:
### Overview of Data before cleaning
# Check the info of the columns
print(train_df.info())
print()
print(test_df.info())
print()
# Check the unique values of the columns
for column in train_df.columns:
print(f"{column}: {len(train_df[column].unique())} unique values")
print()
for column in test_df.columns:
print(f"{column}: {len(test_df[column].unique())} unique values")
print()
#Check if there are any missing values in the data
print(train_df.isnull().sum())
print()
print(test_df.isnull().sum())
print()
# Checking out the Age column to fill in for missing values
print(train_df['Age'].describe())
<class 'pandas.core.frame.DataFrame'> RangeIndex: 891 entries, 0 to 890 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 PassengerId 891 non-null int64 1 Survived 891 non-null int64 2 Pclass 891 non-null int64 3 Name 891 non-null object 4 Sex 891 non-null object 5 Age 714 non-null float64 6 SibSp 891 non-null int64 7 Parch 891 non-null int64 8 Ticket 891 non-null object 9 Fare 891 non-null float64 10 Cabin 204 non-null object 11 Embarked 889 non-null object dtypes: float64(2), int64(5), object(5) memory usage: 83.7+ KB None <class 'pandas.core.frame.DataFrame'> RangeIndex: 418 entries, 0 to 417 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 PassengerId 418 non-null int64 1 Pclass 418 non-null int64 2 Name 418 non-null object 3 Sex 418 non-null object 4 Age 332 non-null float64 5 SibSp 418 non-null int64 6 Parch 418 non-null int64 7 Ticket 418 non-null object 8 Fare 417 non-null float64 9 Cabin 91 non-null object 10 Embarked 418 non-null object 11 Survived 418 non-null int64 dtypes: float64(2), int64(5), object(5) memory usage: 39.3+ KB None PassengerId: 891 unique values Survived: 2 unique values Pclass: 3 unique values Name: 891 unique values Sex: 2 unique values Age: 89 unique values SibSp: 7 unique values Parch: 7 unique values Ticket: 681 unique values Fare: 248 unique values Cabin: 148 unique values Embarked: 4 unique values PassengerId: 418 unique values Pclass: 3 unique values Name: 418 unique values Sex: 2 unique values Age: 80 unique values SibSp: 7 unique values Parch: 8 unique values Ticket: 363 unique values Fare: 170 unique values Cabin: 77 unique values Embarked: 3 unique values Survived: 2 unique values PassengerId 0 Survived 0 Pclass 0 Name 0 Sex 0 Age 177 SibSp 0 Parch 0 Ticket 0 Fare 0 Cabin 687 Embarked 2 dtype: int64 PassengerId 0 Pclass 0 Name 0 Sex 0 Age 86 SibSp 0 Parch 0 Ticket 0 Fare 1 Cabin 327 Embarked 0 Survived 0 dtype: int64 count 714.000000 mean 29.699118 std 14.526497 min 0.420000 25% 20.125000 50% 28.000000 75% 38.000000 max 80.000000 Name: Age, dtype: float64
In [57]:
### Data claning and preprocessing
# Change datatype of columns for EDA
train_df['Sex'] = train_df['Sex'].map({'male':1, 'female':0})
test_df['Sex'] = test_df['Sex'].map({'male':1, 'female':0})
# Dropping columns that are not needed for the analysis (before EDA)
train_df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin','Embarked'], axis=1, inplace=True)
test_df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin','Embarked'], axis=1, inplace=True)
# Filling in the missing values in the Age column and Fare column with the mean of the column
train_df['Age'].fillna(train_df['Age'].mean(), inplace=True)
test_df['Age'].fillna(test_df['Age'].mean(), inplace=True)
train_df['Fare'].fillna(train_df['Fare'].mean(), inplace=True)
test_df['Fare'].fillna(test_df['Fare'].mean(), inplace=True)
print(train_df.info())
print()
print(test_df.info())
print(train_df.shape)
print(test_df.shape)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 891 entries, 0 to 890 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Survived 891 non-null int64 1 Pclass 891 non-null int64 2 Sex 891 non-null int64 3 Age 891 non-null float64 4 SibSp 891 non-null int64 5 Parch 891 non-null int64 6 Fare 891 non-null float64 dtypes: float64(2), int64(5) memory usage: 48.9 KB None <class 'pandas.core.frame.DataFrame'> RangeIndex: 418 entries, 0 to 417 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Pclass 418 non-null int64 1 Sex 418 non-null int64 2 Age 418 non-null float64 3 SibSp 418 non-null int64 4 Parch 418 non-null int64 5 Fare 418 non-null float64 6 Survived 418 non-null int64 dtypes: float64(2), int64(5) memory usage: 23.0 KB None (891, 7) (418, 7)
/var/folders/gn/8692wbbn2jqbdbljkddblglm0000gn/T/ipykernel_75910/7069712.py:12: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method. The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy. For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object. train_df['Age'].fillna(train_df['Age'].mean(), inplace=True) /var/folders/gn/8692wbbn2jqbdbljkddblglm0000gn/T/ipykernel_75910/7069712.py:13: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method. The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy. For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object. test_df['Age'].fillna(test_df['Age'].mean(), inplace=True) /var/folders/gn/8692wbbn2jqbdbljkddblglm0000gn/T/ipykernel_75910/7069712.py:14: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method. The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy. For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object. train_df['Fare'].fillna(train_df['Fare'].mean(), inplace=True) /var/folders/gn/8692wbbn2jqbdbljkddblglm0000gn/T/ipykernel_75910/7069712.py:15: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method. The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy. For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object. test_df['Fare'].fillna(test_df['Fare'].mean(), inplace=True)
In [58]:
### EDA
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Check correlation between the columns
plt.figure(figsize=(5, 5))
sns.heatmap(train_df.corr(numeric_only=True), annot=True, fmt='.2f')
plt.title(f'Correlation Matrix')
plt.show()
# Check the correlation between the feature and the target variable
feature_columns = ["Pclass", "Age", "SibSp", "Parch", "Fare","Sex"]
for column in feature_columns:
plt.figure(figsize=(5, 5))
sns.histplot(x=column, hue="Survived", data=train_df, alpha=0.2, multiple='dodge', shrink=0.8)
plt.title(f'{column} vs Survived')
plt.show()
In [59]:
### data preprocessing
# Usually will split the data with train_test_split but for this case we will not
X_train = train_df[['Sex', 'Fare', 'Age', 'SibSp', 'Parch', 'Pclass']]
y_train = train_df['Survived']
X_test = test_df[['Sex', 'Fare', 'Age', 'SibSp', 'Parch', 'Pclass']]
y_test = test_df['Survived']
# Normalizing the data for fares
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
In [60]:
#Model generation
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
model_name = []
acc_score = []
for model in [LogisticRegression, KNeighborsClassifier, RandomForestClassifier, DecisionTreeClassifier, GradientBoostingClassifier]:
model = model()
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)
print(f"Model: {model.__class__.__name__} || Accuracy: {accuracy_score(y_pred, y_test)}")
acc_score.append(accuracy_score(y_pred, y_test))
model_name.append(model.__class__.__name__)
model_dict = dict(zip(model_name, acc_score))
max_accuracy_model = max(model_dict, key=model_dict.get)
max_accuracy = model_dict[max_accuracy_model]
print()
print(f"Best Model: {max_accuracy_model} || Accuracy: {max_accuracy}")
Model: LogisticRegression || Accuracy: 0.9473684210526315 Model: KNeighborsClassifier || Accuracy: 0.8253588516746412 Model: RandomForestClassifier || Accuracy: 0.8133971291866029 Model: DecisionTreeClassifier || Accuracy: 0.7894736842105263 Model: GradientBoostingClassifier || Accuracy: 0.8732057416267942 Best Model: LogisticRegression || Accuracy: 0.9473684210526315