import matplotlib
from matplotlib import font_manager, rc
import platform
import warnings
warnings.filterwarnings(action='ignore')
### Korean font setup for matplotlib
# Malgun Gothic font file; only read on the Windows branch below.
path = "C:/Windows/Fonts/malgun.ttf"
if platform.system() not in ("Windows", "Darwin"):
    # No font is configured for any other platform.
    print("Unknown System")
elif platform.system() == "Darwin":
    rc('font', family='AppleGothic')
else:
    # Windows: resolve the family name from the font file itself.
    font_name = font_manager.FontProperties(fname=path).get_name()
    rc('font', family=font_name)
# Draw the minus sign with an ASCII hyphen instead of the Unicode minus glyph.
matplotlib.rcParams['axes.unicode_minus'] = False
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
# Load the training set, the test set and the sample submission file.
train, test, sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./data/titanic/train.csv",
        "./data/titanic/test.csv",
        "./data/titanic/gender_submission.csv",
    )
)
# Notebook display: (rows, columns) of each frame.
train.shape, test.shape, sub.shape
((891, 12), (418, 11), (418, 2))
# Notebook display: list the column names of the training frame.
train.columns
Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'], dtype='object')
# Notebook display: preview the first rows of the raw training data.
train.head()
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
# Notebook display: number of missing values per column in each frame.
train.isnull().sum(), test.isnull().sum()
(PassengerId 0 Survived 0 Pclass 0 Name 0 Sex 0 Age 177 SibSp 0 Parch 0 Ticket 0 Fare 0 Cabin 687 Embarked 2 dtype: int64, PassengerId 0 Pclass 0 Name 0 Sex 0 Age 86 SibSp 0 Parch 0 Ticket 0 Fare 1 Cabin 327 Embarked 0 dtype: int64)
# Frequency of each embarkation port; used to pick the fill value for missing entries.
print( train['Embarked'].value_counts() )
S 644 C 168 Q 77 Name: Embarked, dtype: int64
# Missing embarkation ports in the training set are filled with 'S',
# the most frequent value in the counts printed above.
train['Embarked'] = train['Embarked'].fillna('S')
# Missing test fares are replaced with the median fare of the test set.
test.loc[test['Fare'].isnull(), 'Fare'] = test['Fare'].median()
# Notebook display: re-check the remaining missing values per column.
train.isnull().sum(), test.isnull().sum()
(PassengerId 0 Survived 0 Pclass 0 Name 0 Sex 0 Age 177 SibSp 0 Parch 0 Ticket 0 Fare 0 Cabin 687 Embarked 0 dtype: int64, PassengerId 0 Pclass 0 Name 0 Sex 0 Age 86 SibSp 0 Parch 0 Ticket 0 Fare 0 Cabin 327 Embarked 0 dtype: int64)
# Notebook display: preview the training data after filling missing values.
train.head()
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
# Encode the Sex column as an int32 code in both frames (male -> 1, female -> 2).
cat_Sex = dict(male=1, female=2)
train['Sex'], test['Sex'] = (
    frame['Sex'].map(cat_Sex).astype("int32") for frame in (train, test)
)
# Notebook display: confirm the encoded column.
train.head()
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | 1 | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | 2 | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | 2 | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | 2 | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | 1 | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
# Encode the embarkation port as an integer code in both frames (S -> 1, C -> 2, Q -> 3).
cat_Embarked = dict(S=1, C=2, Q=3)
train['Embarked'], test['Embarked'] = (
    frame['Embarked'].map(cat_Embarked) for frame in (train, test)
)
# Notebook display: confirm the encoded column.
train.head()
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | 1 | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | 1 |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | 2 | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | 2 |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | 2 | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | 1 |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | 2 | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | 1 |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | 1 | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | 1 |
# Notebook display: column names available for feature selection.
train.columns
Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'], dtype='object')
# Feature columns used by every model below.
sel = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Fare', 'Embarked']
# Features and target from the labelled training data.
X_tr, y_tr = train[sel], train['Survived']
# Same feature columns from the unlabelled test set.
X_last_test = test[sel]
# Hold out 20% of the labelled rows for evaluation, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X_tr,
    y_tr,
    test_size=0.2,
    random_state=0,
    stratify=y_tr,
)
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
# Fit the three classifiers on the training split.
svc = SVC(gamma=.05).fit(X_train, y_train)
# Seed the tree-based models (matching the random_state=0 used for the split)
# so the reported metrics can be reproduced from run to run.
tree = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)
rf = RandomForestClassifier(random_state=0).fit(X_train, y_train)
# Hold-out classification report for the SVC.
pred = svc.predict(X_test)
print(classification_report(y_test, pred))
precision recall f1-score support 0 0.76 0.81 0.78 110 1 0.66 0.59 0.63 69 accuracy 0.73 179 macro avg 0.71 0.70 0.71 179 weighted avg 0.72 0.73 0.72 179
# Hold-out classification report for the decision tree.
pred = tree.predict(X_test)
print(classification_report(y_true=y_test, y_pred=pred))
precision recall f1-score support 0 0.79 0.81 0.80 110 1 0.68 0.65 0.67 69 accuracy 0.75 179 macro avg 0.73 0.73 0.73 179 weighted avg 0.75 0.75 0.75 179
# Hold-out classification report for the random forest.
pred = rf.predict(X_test)
print(classification_report(y_true=y_test, y_pred=pred))
precision recall f1-score support 0 0.78 0.77 0.78 110 1 0.64 0.65 0.65 69 accuracy 0.73 179 macro avg 0.71 0.71 0.71 179 weighted avg 0.73 0.73 0.73 179
from sklearn.metrics import precision_recall_curve
# Refit the SVC so this cell can be run on its own.
svc = SVC(gamma=.05).fit(X_train, y_train)
# Signed decision scores for the hold-out rows; they are distributed around 0.
pred_svc = svc.decision_function(X_test)
# Show the scores just computed (previously this printed the stale `pred`
# left over from an earlier cell).
print(pred_svc[0:10])
[0.72 0.27640421 0.65 0.27640421 1. 0. 0.28250037 0.84812157 0.2 0.09069573]
# Precision/recall pairs for every candidate threshold on the SVC scores.
precision, recall, thresholds = precision_recall_curve(y_test, pred_svc)
print("임계값 : ", thresholds.min(), thresholds.max())
# Position of the threshold whose absolute value is smallest, i.e. closest to 0.
close_zero = np.abs(thresholds).argmin()
print(close_zero)
# Mark the operating point at that threshold with a hollow black circle.
plt.plot(
    precision[close_zero], recall[close_zero], 'o',
    label="임계값 0", markersize=10, fillstyle="none", c='k',
)
# Full curve, with precision on the x axis and recall on the y axis.
plt.plot(precision, recall, label="정밀도-재현율 곡선")
plt.xlabel("정밀도")
plt.ylabel("재현율")
plt.legend(loc="best")
임계값 : -1.1620575738476628 1.2594963931265546 74
<matplotlib.legend.Legend at 0x2346e180880>
# Refit the forest with a fixed seed so the curve below is reproducible.
rf = RandomForestClassifier(random_state=0).fit(X_train, y_train)
# Column 1 of predict_proba is the probability of class 1 (Survived == 1).
# These are probabilities in [0, 1], not scores centred on 0.
pred_rf = rf.predict_proba(X_test)[:, 1]
# Show the probabilities just computed (previously this printed the stale `pred`).
print(pred_rf[0:10])
[0.72 0.27640421 0.65 0.27640421 1. 0. 0.28250037 0.84812157 0.2 0.09069573]
# `precision` and `recall` still hold the SVC curve computed from pred_svc
# in the earlier cell; only the random-forest curve is computed here.
precision_rf, recall_rf, thresholds_rf = precision_recall_curve(
    y_test, pred_rf
)
# Overlay both curves (x: precision, y: recall).
plt.plot(precision, recall, label="svc")
plt.plot(precision_rf, recall_rf, label="랜덤포레스트")
plt.ylabel("재현율")
plt.xlabel("정밀도")
plt.title("정밀도-재현율 곡선")
plt.legend(loc="best")
<matplotlib.legend.Legend at 0x2346dfd9a60>
# Refit the tree with a fixed seed so the curve below is reproducible.
tree = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)
# Column 1 of predict_proba is the probability of class 1 (Survived == 1).
# These are probabilities in [0, 1], not scores centred on 0.
pred_tree = tree.predict_proba(X_test)[:, 1]
precision_dt, recall_dt, thresholds_dt = precision_recall_curve(y_test, pred_tree)
# Overlay the SVC and random-forest curves from the earlier cells with the
# decision-tree curve (x: precision, y: recall).
plt.plot(precision, recall, label="svc")
plt.plot(precision_rf, recall_rf, label="랜덤포레스트")
plt.plot(precision_dt, recall_dt, label="의사결정트리")
plt.title("정밀도-재현율 곡선")
plt.xlabel("정밀도")
plt.ylabel("재현율")
plt.legend(loc="best")
<matplotlib.legend.Legend at 0x2346e215880>
from sklearn.metrics import f1_score
# F1 score of each model's hard predictions on the hold-out split.
rf_f1score, svc_f1score, tree_f1score = (
    f1_score(y_test, model.predict(X_test)) for model in (rf, svc, tree)
)
print(f"랜덤 포레스트의 f1_score: {rf_f1score:.3f}")
print(f"svc의 f1_score: {svc_f1score:.3f}")
print(f"의사결정트리의 f1_score: {tree_f1score:.3f}")
랜덤 포레스트의 f1_score: 0.657 svc의 f1_score: 0.626 의사결정트리의 f1_score: 0.667
from sklearn.metrics import average_precision_score
## Continuous scores for the hold-out rows
# Class-1 probabilities for the forest and the tree, decision scores for the SVC.
rf_pro = rf.predict_proba(X_test)[:, 1]
df_pro = tree.predict_proba(X_test)[:, 1]
svc_dcfun = svc.decision_function(X_test)
# Average precision of each model, computed from those scores.
ap_rf, ap_df, ap_svc = (
    average_precision_score(y_test, scores)
    for scores in (rf_pro, df_pro, svc_dcfun)
)
print(f"랜덤 포레스트의 평균 정밀도: {ap_rf:.3f}")
print(f"의사결정트리의 평균 정밀도: {ap_df:.3f}")
print(f"svc의 평균 정밀도: {ap_svc:.3f}")
랜덤 포레스트의 평균 정밀도: 0.780 의사결정트리의 평균 정밀도: 0.600 svc의 평균 정밀도: 0.669
from sklearn.metrics import roc_curve
# ROC points for each model: decision scores for the SVC, class-1 probabilities
# for the forest and the tree.
fpr_svc, tpr_svc, thresholds_svc = roc_curve(y_test, svc.decision_function(X_test))
fpr_rf, tpr_rf, thresholds_rf = roc_curve(y_test, rf.predict_proba(X_test)[:, 1])
fpr_dt, tpr_dt, thresholds_dt = roc_curve(y_test, tree.predict_proba(X_test)[:, 1])
plt.plot(fpr_rf, tpr_rf, label="랜덤포레스트 ROC 곡선")
plt.plot(fpr_svc, tpr_svc, label="SVM ROC 곡선")
plt.plot(fpr_dt, tpr_dt, label="의사결정트리 ROC 곡선")
plt.xlabel("FPrate, 1-특이도")
plt.ylabel("TPrate(재현율, 민감도)")
# Random-forest operating point whose threshold is closest to 0.7.
close_07_rf = np.argmin(np.abs(thresholds_rf - 0.7))
plt.plot(fpr_rf[close_07_rf], tpr_rf[close_07_rf], '^', label="RF 임계값 0.7")
# Random-forest operating point whose threshold is closest to 0.3.
close_03_rf = np.argmin(np.abs(thresholds_rf - 0.3))
plt.plot(fpr_rf[close_03_rf], tpr_rf[close_03_rf], 'v', label="RF 임계값 0.3")
# Build the legend last so the two threshold markers are listed as well;
# it was previously created before the markers were drawn.
plt.legend(loc=4)
[<matplotlib.lines.Line2D at 0x2346f80b7f0>]
from sklearn.metrics import roc_auc_score
# Area under the ROC curve: class-1 probabilities for the forest and the tree,
# decision scores for the SVC.
rf_auc, df_auc, svc_auc = (
    roc_auc_score(y_test, scores)
    for scores in (
        rf.predict_proba(X_test)[:, 1],
        tree.predict_proba(X_test)[:, 1],
        svc.decision_function(X_test),
    )
)
print(f"랜덤 포레스트의 AUC: {rf_auc:.3f}")
print(f"의사결정트리의 AUC: {df_auc:.3f}")
print(f"SVC의 AUC: {svc_auc:.3f}")
랜덤 포레스트의 AUC: 0.796 의사결정트리의 AUC: 0.738 SVC의 AUC: 0.773
평균 정밀도 값은 랜덤 포레스트가 가장 좋습니다.
AUC의 값이 1에 가장 가까운 모델은 0.796으로 랜덤 포레스트 모델입니다.
교육용으로 작성된 것으로 배포 및 복제시에 사전 허가가 필요합니다.
Copyright 2022 LIM Co. all rights reserved.