Variable | Description | Values |
---|---|---|
Survival | Whether the passenger survived | 0 = No, 1 = Yes |
Pclass | Ticket class | 1 = 1st, 2 = 2nd, 3 = 3rd |
Sex | Sex | male / female |
Age | Age in years | |
SibSp | # of siblings / spouses aboard the Titanic | |
Parch | # of parents / children aboard the Titanic | |
Ticket | Ticket number | e.g. CA 31352, A/5. 2151 |
Fare | Passenger fare | |
Cabin | Cabin number | |
Embarked | Port of embarkation | C = Cherbourg, Q = Queenstown, S = Southampton |
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
train = pd.read_csv("data/titanic/train.csv")
test = pd.read_csv("data/titanic/test.csv")
sub = pd.read_csv("data/titanic/gender_submission.csv")
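Before plotting anything, a quick structural check helps confirm that the loaded frames match the data dictionary above. This is a minimal sketch using standard pandas calls; nothing in the later steps depends on it.
print(train.shape, test.shape)  # row/column counts of the two files
print(train.head())             # first rows; columns as described in the table above
print(train.describe())         # summary statistics for the numeric columns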
plt.figure(figsize=(10, 7))
sns.heatmap(train.isnull(), yticklabels=False, cbar=False)  # cbar=False: don't draw the color bar
plt.figure(figsize=(10, 7))
sns.heatmap(test.isnull(), yticklabels=False, cbar=False)   # cbar=False: don't draw the color bar
sns.set_style('whitegrid')
sns.countplot(x='Survived', data=train)
# histplot replaces the deprecated sns.distplot; kde=True keeps the density curve
sns.histplot(train['Age'].dropna(), bins=30, kde=True)
f, ax = plt.subplots(1, 2, figsize=(18, 8))
# first plot: Age distribution in train
sns.histplot(train['Age'].dropna(), bins=30, kde=True, ax=ax[0])
ax[0].set_title('train - Age')
# second plot: Age distribution in test
sns.histplot(test['Age'].dropna(), bins=30, kde=True, ax=ax[1])
ax[1].set_title('test - Age')
plt.show()
train.info()  # info() prints its summary directly, so print() is not needed
print()
test.info()
# impute missing ages with each frame's mean age
train['Age'] = train['Age'].fillna(train['Age'].mean())
test['Age'] = test['Age'].fillna(test['Age'].mean())
print(train.isnull().sum())
print(test.isnull().sum())
train['Embarked'].value_counts()  # 'S' is the most frequent port
train['Embarked'] = train['Embarked'].fillna('S')  # fill missing Embarked with the mode
test['Fare'] = test['Fare'].fillna(test['Fare'].mean())  # fill the missing Fare in test with the mean
print(train.isnull().sum())
print(test.isnull().sum())
train.info()
test.info()
# numeric columns only; Sex and Embarked would need encoding before they could be used
sel = ['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
all_X = train[sel]
all_y = train['Survived']
last_X_test = test[sel]
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    all_X, all_y, stratify=all_y, test_size=0.3, random_state=77)
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier().fit(X_train, y_train)
acc_tr = model.score(X_train, y_train)
acc_test = model.score(X_test, y_test)
acc_tr, acc_test
depth_param = range(1, 10)
for i in depth_param:
    model = DecisionTreeClassifier(max_depth=i).fit(X_train, y_train)
    acc_tr = model.score(X_train, y_train)
    acc_test = model.score(X_test, y_test)
    print("max_depth : {} , train/test accuracy : {} {}".format(i, acc_tr, acc_test))
# retrain with the chosen depth on the full training data, then predict on the test set
model = DecisionTreeClassifier(max_depth=4)
model.fit(all_X, all_y)
pred = model.predict(last_X_test)
sub['Survived'] = pred
sub.to_csv("tree_second_sub.csv", index=False)  # submission score: 0.65879
import os
files = os.listdir()
print("File exists:", "tree_second_sub.csv" in files)
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
model = LogisticRegression().fit(X_train, y_train)
acc_tr = model.score(X_train, y_train)
acc_test = model.score(X_test, y_test)
acc_tr, acc_test
model = LinearSVC().fit(X_train, y_train)
acc_tr = model.score(X_train, y_train)
acc_test = model.score(X_test, y_test)
acc_tr, acc_test
model = KNeighborsClassifier().fit(X_train, y_train)
acc_tr = model.score(X_train, y_train)
acc_test = model.score(X_test, y_test)
acc_tr, acc_test
model = LogisticRegression()
model.fit(all_X, all_y)
pred = model.predict(last_X_test)
sub['Survived'] = pred
sub.to_csv("third_lgreg_sub.csv", index=False)
files = os.listdir()
print("파일 유무 확인 : ", "third_lgreg_sub.csv" in files) # 0.66746