titanic.py
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier
# Load data
dir_path = r""
df_train = pd.read_csv(dir_path + r"train.csv")
df_test = pd.read_csv(dir_path + r"test.csv")
labels = pd.read_csv(dir_path + r"gender_submission.csv")
df_test = df_test.merge(labels, on=["PassengerId"])
# List of variables to use
categorical_variables = ["Pclass", "Sex", "Embarked"]
numeric_variables = ["Age", "SibSp", "Parch", "Fare"]
total_list = categorical_variables + numeric_variables + ["Survived"]
df_train = df_train[total_list]
df_test = df_test[total_list]
# Cast columns to appropriate dtypes
def to_good_type(df, categorical_columns, numeric_columns):
    for col in categorical_columns:
        df[col] = df[col].astype('category')
    for col in numeric_columns:
        df[col] = df[col].astype('float')
    return df
df_train = to_good_type(df_train, categorical_variables, numeric_variables)
df_test = to_good_type(df_test, categorical_variables, numeric_variables)
# One-hot encode the categorical variables (with an indicator column for NaN)
df_train = pd.get_dummies(df_train, dummy_na=True)
df_test = pd.get_dummies(df_test, dummy_na=True)
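# Optional safeguard (an addition, not in the original script): get_dummies can
# produce different columns for train and test if a category level appears in
# only one of them, so align the test frame to the training columns.
df_test = df_test.reindex(columns=df_train.columns, fill_value=0)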
# Fill missing numeric values with the training-set mean (avoids test-set leakage)
for col in numeric_variables:
    train_mean = df_train[col].mean()
    df_train[col] = df_train[col].fillna(train_mean)
    df_test[col] = df_test[col].fillna(train_mean)
# Min-max scale numeric columns: fit on the training set only, then apply to test
scaler = MinMaxScaler(feature_range=(0, 1))
df_train[numeric_variables] = scaler.fit_transform(df_train[numeric_variables])
df_test[numeric_variables] = scaler.transform(df_test[numeric_variables])
# Build an XGBoost model and fit it on the training data
model = XGBClassifier()
data_columns = [i for i in df_train.columns if i not in ["Survived"]]
model.fit(df_train[data_columns].values, df_train["Survived"].values)
# make predictions on the training data to gauge in-sample fit
y_pred = model.predict(df_train[data_columns].values)
predictions = [round(value) for value in y_pred]
# evaluate predictions
accuracy = accuracy_score(df_train["Survived"].values, predictions)
print("Accuracy on training set: %.2f%%" % (accuracy * 100.0))
print(classification_report(df_train["Survived"].values, predictions))
# make predictions on the test set for the submission file
y_pred = model.predict(df_test[data_columns].values)
predictions = [round(value) for value in y_pred]
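# Optional sanity check (a sketch, not part of the original script): the
# gender_submission labels were merged into df_test above, so we can also report
# accuracy against them before df_test is reloaded below. Note these are Kaggle's
# sample-submission labels, not true outcomes.
print("Accuracy vs. gender_submission labels: %.2f%%"
      % (accuracy_score(df_test["Survived"].values, predictions) * 100.0))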
# Reload the raw test file to recover PassengerId for the submission
df_test = pd.read_csv(dir_path + r"test.csv")
output = pd.DataFrame({'PassengerId': df_test.PassengerId, 'Survived': predictions})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")