import matplotlib
from matplotlib import font_manager, rc
import platform
import warnings
warnings.filterwarnings(action='ignore')
### Korean font setup
if platform.system() == "Windows":
    path = "C:/Windows/Fonts/malgun.ttf"
    font_name = font_manager.FontProperties(fname=path).get_name()
    rc('font', family=font_name)
elif platform.system() == "Darwin":
    rc('font', family='AppleGothic')
else:
    print("Unknown System")
matplotlib.rcParams['axes.unicode_minus'] = False
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import classification_report
from mglearn.datasets import make_blobs
X, y = make_blobs(n_samples=(400, 50),
                  centers=2, cluster_std=[7.0, 2],
                  random_state=22)
print(X.shape, y.shape)
(450, 2) (450,)
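# Quick sanity check (illustrative): n_samples=(400, 50) asks for an imbalanced
# dataset, so class 0 should contain 400 samples and class 1 only 50.
print(np.bincount(y))  # expected: [400  50]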
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
svc = SVC(gamma=.05).fit(X_train, y_train)
tree = DecisionTreeClassifier().fit(X_train, y_train)
# SVC model
pred = svc.predict(X_test)
print(classification_report(y_test, pred))
              precision    recall  f1-score   support

           0       0.97      0.89      0.93       104
           1       0.35      0.67      0.46         9

    accuracy                           0.88       113
   macro avg       0.66      0.78      0.70       113
weighted avg       0.92      0.88      0.89       113
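# For reference, the confusion matrix behind this report can be inspected
# directly; from the numbers above it works out to roughly [[93, 11], [3, 6]],
# i.e. most of the errors involve the small class 1 (illustrative check):
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, pred))  # rows: true class, columns: predicted class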
# Decision tree model
pred = tree.predict(X_test)
print(classification_report(y_test, pred))
              precision    recall  f1-score   support

           0       0.98      0.88      0.92       104
           1       0.35      0.78      0.48         9

    accuracy                           0.87       113
   macro avg       0.66      0.83      0.70       113
weighted avg       0.93      0.87      0.89       113
from sklearn.metrics import precision_recall_curve
precision, recall, thresholds = precision_recall_curve(y_test,
                                                       svc.decision_function(X_test))
# Increase the number of data points to get a smoother curve
X, y = make_blobs(n_samples=(4000, 500),
                  centers=2,
                  cluster_std=[7.0, 2],
                  random_state=22)
print(X.shape, y.shape)
(4500, 2) (4500,)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=10)
plt.scatter(X[:, 0], X[:, 1],
            c=y,
            cmap=plt.cm.autumn,  # alternatives: plt.cm.Blues, RdYlGn, BrBG, Greens, RdGy, YlOrRd
            s=60, edgecolors='k')
svc = SVC(gamma=.05).fit(X_train, y_train)
pred = svc.decision_function(X_test)  # scores are distributed around 0, the decision boundary
print(pred[0:10])
[-1.09425577 -1.10667545 -1.10736997 -1.19140534 -1.22918652 -1.19749983 -1.17059048 -1.24720063 -1.19551381 -1.11214569]
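# Illustrative check: for a binary SVC, the sign of the decision function
# determines the predicted class, so thresholding these scores at 0 should
# reproduce svc.predict.
print(np.all((pred > 0).astype(int) == svc.predict(X_test)))  # expected: True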
precision, recall, thresholds = precision_recall_curve(y_test, pred)
print("임계값 : ", thresholds.min(), thresholds.max())
# 0에 가까운 임계값을 찾습니다
close_zero = np.argmin(np.abs(thresholds)) # thresholds의 절대값이 가장 작은 것(위치)
print(close_zero)
plt.plot(precision[close_zero],
         recall[close_zero], 'o',
         markersize=10,
         label="threshold 0",
         fillstyle="none", c='k')
plt.plot(precision, recall, label="precision-recall curve")
plt.xlabel("precision")
plt.ylabel("recall")
plt.legend(loc="best")
thresholds:  -1.5528391475651482 1.4968799824567065
983
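# Note (illustrative): precision_recall_curve returns one more precision/recall
# value than thresholds; the final point is precision=1, recall=0 and has no
# corresponding threshold, so precision[close_zero] indexes safely here.
print(precision.shape, recall.shape, thresholds.shape)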
X_train.shape  # (number of samples, number of features) in the training set
(3375, 2)
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100, random_state=0, max_features=2)
rf.fit(X_train, y_train)
pred = rf.predict_proba(X_test)[:, 1]
pred
array([0. , 0. , 0. , ..., 0. , 0.66, 0. ])
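# Illustrative check: predict_proba returns one column per class, the rows sum
# to 1, and [:, 1] is the estimated probability of class 1 used as the score here.
proba = rf.predict_proba(X_test)
print(proba.shape)                        # (n_samples, 2)
print(np.allclose(proba.sum(axis=1), 1))  # expected: True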
# SVC model curve
plt.plot(precision, recall, label="svc")
plt.plot(precision[close_zero],
         recall[close_zero], 'o',
         markersize=10,
         label="svc: threshold 0",
         fillstyle="none",
         c='k',
         mew=2)
# RandomForestClassifier provides predict_proba instead of decision_function.
precision_rf, recall_rf, thresholds_rf = precision_recall_curve(y_test, pred)
# Random forest curve
plt.plot(precision_rf, recall_rf, label="rf")
close_zero_rf = np.argmin(np.abs(thresholds_rf - 0.5))  # index where the threshold is closest to 0.5
print(close_zero_rf)
plt.plot(precision_rf[close_zero_rf], recall_rf[close_zero_rf], '^', c='k',
         markersize=10, label="rf: threshold 0.5", fillstyle="none", mew=2)
plt.xlabel("precision")
plt.ylabel("recall")
plt.legend(loc="best")
45
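# Illustrative check: for probability-based models, predict() corresponds to a
# 0.5 cutoff (sklearn takes the argmax over predict_proba), so the marked point
# is the random forest's analogue of threshold 0 for decision_function.
print(np.all((pred > 0.5).astype(int) == rf.predict(X_test)))  # expected: True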
from sklearn.metrics import f1_score
rf_f1score = f1_score(y_test, rf.predict(X_test))
svc_f1score = f1_score(y_test, svc.predict(X_test))
print("random forest f1_score: {:.3f}".format(rf_f1score))
print("svc f1_score: {:.3f}".format(svc_f1score))
random forest f1_score: 0.573
svc f1_score: 0.661
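# f1 is the harmonic mean of precision and recall, f1 = 2*P*R / (P + R).
# Recomputing the svc score from its parts (illustrative check):
from sklearn.metrics import precision_score, recall_score
p = precision_score(y_test, svc.predict(X_test))
r = recall_score(y_test, svc.predict(X_test))
print("2*p*r/(p+r): {:.3f}".format(2 * p * r / (p + r)))  # should match svc_f1score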
from sklearn.metrics import average_precision_score
## probability predictions
rf_pro = rf.predict_proba(X_test)[:, 1]
svc_dcfun = svc.decision_function(X_test)
ap_rf = average_precision_score(y_test, rf_pro)
ap_svc = average_precision_score(y_test, svc_dcfun)
print("랜덤 포레스트의 평균 정밀도: {:.3f}".format(ap_rf))
print("svc의 평균 정밀도: {:.3f}".format(ap_svc))
랜덤 포레스트의 평균 정밀도: 0.608 svc의 평균 정밀도: 0.632
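# average_precision_score summarizes the PR curve as a weighted mean of
# precisions, AP = sum_n (R_n - R_{n-1}) * P_n. Recomputing it by hand from the
# svc curve (illustrative; the recall array is decreasing, hence the minus sign):
p_svc, r_svc, _ = precision_recall_curve(y_test, svc_dcfun)
print(-np.sum(np.diff(r_svc) * p_svc[:-1]))  # should match ap_svc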
from sklearn.metrics import roc_curve
fpr, tpr, thresholds = roc_curve(y_test, svc.decision_function(X_test))
fpr.shape, tpr.shape, thresholds
((120,), (120,), array([ 2.49687998, 1.49687998, 1.47581867, 1.4610055 , 1.45802113, 1.44126949, 1.39147826, 1.37380751, 1.35378001, 1.35115809, 1.23928869, 1.22953332, 1.21983036, 1.20198557, 1.18379643, 1.16423601, 1.03010903, 0.96014974, 0.95503679, 0.9238038 , 0.91992255, 0.91412964, 0.91086392, 0.91041837, 0.86674391, 0.85989618, 0.85955523, 0.77050185, 0.76271236, 0.76034868, 0.74948191, 0.74083693, 0.67295553, 0.6313474 , 0.59612443, 0.56820147, 0.5421393 , 0.52460529, 0.51559663, 0.50719384, 0.37819992, 0.35974527, 0.35286999, 0.35078072, 0.33633912, 0.28457478, 0.28263032, 0.262315 , 0.22681224, 0.22489642, 0.20402619, 0.19841834, 0.1535657 , 0.13174612, 0.11071806, 0.06216232, 0.05418311, 0.05175504, 0.02982469, -0.01082579, -0.0441958 , -0.07263297, -0.14957616, -0.16204013, -0.22312698, -0.26617542, -0.28148442, -0.29640122, -0.33187486, -0.36364591, -0.37896216, -0.413147 , -0.41411063, -0.46562446, -0.47806116, -0.48767225, -0.5058179 , -0.51036328, -0.5110437 , -0.52644175, -0.53222803, -0.58481903, -0.58940514, -0.630053 , -0.65681141, -0.69947483, -0.70880328, -0.76236455, -0.76766451, -0.79133577, -0.80125093, -0.84469204, -0.84626488, -0.89704645, -0.89985197, -0.91717414, -0.93386616, -0.98339756, -0.98997147, -0.99198362, -0.99232122, -1.00442937, -1.00450048, -1.0275707 , -1.02764516, -1.05723133, -1.05811294, -1.06372408, -1.06460692, -1.09122745, -1.0915653 , -1.11441756, -1.11452809, -1.18688593, -1.18689626, -1.38761101, -1.3947979 , -1.53375669, -1.55283915, -1.77988356]))
plt.plot(fpr, tpr, label="ROC 곡선")
plt.xlabel("FPR")
plt.ylabel("TPR (재현율)")
# 임계값이 0 근처의 임계값을 찾습니다
close_zero = np.argmin(np.abs(thresholds))
plt.plot(fpr[close_zero], tpr[close_zero], 'o', markersize=10,
label="임계값 0", fillstyle="none", c='k', mew=2)
plt.legend(loc=4)
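# Illustrative check: FPR = FP / (FP + TN) and TPR = TP / (TP + FN). Recomputing
# the marked point from the default predictions (score threshold 0) by hand:
from sklearn.metrics import confusion_matrix
tn, fp, fn, tp = confusion_matrix(y_test, svc.predict(X_test)).ravel()
print(fp / (fp + tn), tp / (tp + fn))  # should agree with the marked point (up to the nearest available threshold)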
from sklearn.metrics import roc_curve
fpr_rf, tpr_rf, thresholds_rf = roc_curve(y_test, rf.predict_proba(X_test)[:, 1])
plt.plot(fpr, tpr, label="SVC의 ROC 곡선")
plt.plot(fpr_rf, tpr_rf, label="RF의 ROC 곡선")
plt.xlabel("FPR")
plt.ylabel("TPR (재현율)")
plt.plot(fpr[close_zero], tpr[close_zero], 'o', markersize=10,
label="SVC 임계값 0", fillstyle="none", c='k', mew=2)
close_05_rf = np.argmin(np.abs(thresholds_rf - 0.5))
plt.plot(fpr_rf[close_05_rf], tpr[close_05_rf], '^', markersize=10,
label="RF 임계값 0.5", fillstyle="none", c='k', mew=2)
plt.legend(loc=4)
from sklearn.metrics import roc_auc_score
rf_auc = roc_auc_score(y_test, rf.predict_proba(X_test)[:, 1])
svc_auc = roc_auc_score(y_test, svc.decision_function(X_test))
print("랜덤 포레스트의 AUC: {:.3f}".format(rf_auc))
print("SVC의 AUC: {:.3f}".format(svc_auc))
랜덤 포레스트의 AUC: 0.933 SVC의 AUC: 0.925
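# AUC has a ranking interpretation: it is the probability that a randomly
# chosen positive sample receives a higher score than a randomly chosen
# negative one. A direct pairwise check for the SVC scores (illustrative;
# ties are negligible for continuous decision-function scores):
scores = svc.decision_function(X_test)
pos, neg = scores[y_test == 1], scores[y_test == 0]
print((pos[:, None] > neg[None, :]).mean())  # should match svc_auc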
Written for educational purposes; prior permission is required for distribution or reproduction.
Copyright 2022 LIM Co. All rights reserved.