import matplotlib
from matplotlib import font_manager, rc
import platform
import warnings
warnings.filterwarnings(action='ignore')
### Korean font setup
if platform.system() == "Windows":
    path = "C:/Windows/Fonts/malgun.ttf"
    font_name = font_manager.FontProperties(fname=path).get_name()
    rc('font', family=font_name)
elif platform.system() == "Darwin":
    rc('font', family='AppleGothic')
else:
    print("Unknown System")
matplotlib.rcParams['axes.unicode_minus'] = False
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import classification_report
from mglearn.datasets import make_blobs
X, y = make_blobs(n_samples=(400, 50),
                  centers=2, cluster_std=[7.0, 2],
                  random_state=22)
print(X.shape, y.shape)
(450, 2) (450,)
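# Quick sanity check (illustrative): n_samples=(400, 50) asks for an imbalanced
# dataset, so class 0 should contain 400 samples and class 1 only 50.
print(np.bincount(y))  # expected: [400  50]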
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
svc = SVC(gamma=.05).fit(X_train, y_train)
tree = DecisionTreeClassifier().fit(X_train, y_train)
# SVC model
pred = svc.predict(X_test)
print(classification_report(y_test, pred))
              precision    recall  f1-score   support

           0       0.97      0.89      0.93       104
           1       0.35      0.67      0.46         9

    accuracy                           0.88       113
   macro avg       0.66      0.78      0.70       113
weighted avg       0.92      0.88      0.89       113
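# For reference, the confusion matrix behind this report can be inspected
# directly; from the numbers above it works out to roughly [[93, 11], [3, 6]],
# i.e. most of the errors involve the small class 1 (illustrative check):
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, pred))  # rows: true class, columns: predicted class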
# Decision tree model
pred = tree.predict(X_test)
print(classification_report(y_test, pred))
              precision    recall  f1-score   support

           0       0.98      0.88      0.92       104
           1       0.35      0.78      0.48         9

    accuracy                           0.87       113
   macro avg       0.66      0.83      0.70       113
weighted avg       0.93      0.87      0.89       113
from sklearn.metrics import precision_recall_curve
precision, recall, thresholds = precision_recall_curve(y_test,
                                                       svc.decision_function(X_test))
# Increase the number of data points to get a smoother curve
X, y = make_blobs(n_samples=(4000, 500),
                  centers=2,
                  cluster_std=[7.0, 2],
                  random_state=22)
print(X.shape, y.shape)
(4500, 2) (4500,)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=10)
plt.scatter(X[:, 0], X[:, 1],
            c=y,
            cmap=plt.cm.autumn,  # alternatives: plt.cm.Blues, RdYlGn, BrBG, Greens, RdGy, YlOrRd
            s=60, edgecolors='k')
svc = SVC(gamma=.05).fit(X_train, y_train)
pred = svc.decision_function(X_test)  # scores are distributed around 0, the decision boundary
print(pred[0:10])
[-1.09425577 -1.10667545 -1.10736997 -1.19140534 -1.22918652 -1.19749983 -1.17059048 -1.24720063 -1.19551381 -1.11214569]
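# Illustrative check: for a binary SVC, the sign of the decision function
# determines the predicted class, so thresholding these scores at 0 should
# reproduce svc.predict.
print(np.all((pred > 0).astype(int) == svc.predict(X_test)))  # expected: True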
precision, recall, thresholds = precision_recall_curve(y_test, pred)
print("임계값 : ", thresholds.min(), thresholds.max())
# 0에 가까운 임계값을 찾습니다
close_zero = np.argmin(np.abs(thresholds)) # thresholds의 절대값이 가장 작은 것(위치)
print(close_zero)
plt.plot(precision[close_zero],
         recall[close_zero], 'o',
         markersize=10,
         label="threshold 0",
         fillstyle="none", c='k')
plt.plot(precision, recall, label="precision-recall curve")
plt.xlabel("precision")
plt.ylabel("recall")
plt.legend(loc="best")
thresholds:  -1.5528391475651482 1.4968799824567065
983
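# Note (illustrative): precision_recall_curve returns one more precision/recall
# value than thresholds; the final point is precision=1, recall=0 and has no
# corresponding threshold, so precision[close_zero] indexes safely here.
print(precision.shape, recall.shape, thresholds.shape)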
X_train.shape  # (number of samples, number of features) in the training set
(3375, 2)
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100, random_state=0, max_features=2)
rf.fit(X_train, y_train)
pred = rf.predict_proba(X_test)[:, 1]
pred
array([0. , 0. , 0. , ..., 0. , 0.66, 0. ])
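# Illustrative check: predict_proba returns one column per class, the rows sum
# to 1, and [:, 1] is the estimated probability of class 1 used as the score here.
proba = rf.predict_proba(X_test)
print(proba.shape)                        # (n_samples, 2)
print(np.allclose(proba.sum(axis=1), 1))  # expected: True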
# SVC model curve
plt.plot(precision, recall, label="svc")
plt.plot(precision[close_zero],
         recall[close_zero], 'o',
         markersize=10,
         label="svc: threshold 0",
         fillstyle="none",
         c='k',
         mew=2)
# RandomForestClassifier provides predict_proba instead of decision_function.
precision_rf, recall_rf, thresholds_rf = precision_recall_curve(y_test, pred)
# Random forest curve
plt.plot(precision_rf, recall_rf, label="rf")
close_zero_rf = np.argmin(np.abs(thresholds_rf - 0.5))  # index where the threshold is closest to 0.5
print(close_zero_rf)
plt.plot(precision_rf[close_zero_rf], recall_rf[close_zero_rf], '^', c='k',
         markersize=10, label="rf: threshold 0.5", fillstyle="none", mew=2)
plt.xlabel("precision")
plt.ylabel("recall")
plt.legend(loc="best")
45
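# Illustrative check: for probability-based models, predict() corresponds to a
# 0.5 cutoff (sklearn takes the argmax over predict_proba), so the marked point
# is the random forest's analogue of threshold 0 for decision_function.
print(np.all((pred > 0.5).astype(int) == rf.predict(X_test)))  # expected: True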
from sklearn.metrics import f1_score
rf_f1score = f1_score(y_test, rf.predict(X_test))
svc_f1score = f1_score(y_test, svc.predict(X_test))
print("random forest f1_score: {:.3f}".format(rf_f1score))
print("svc f1_score: {:.3f}".format(svc_f1score))
random forest f1_score: 0.573
svc f1_score: 0.661
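# f1 is the harmonic mean of precision and recall, f1 = 2*P*R / (P + R).
# Recomputing the svc score from its parts (illustrative check):
from sklearn.metrics import precision_score, recall_score
p = precision_score(y_test, svc.predict(X_test))
r = recall_score(y_test, svc.predict(X_test))
print("2*p*r/(p+r): {:.3f}".format(2 * p * r / (p + r)))  # should match svc_f1score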
from sklearn.metrics import average_precision_score
## probability predictions
rf_pro = rf.predict_proba(X_test)[:, 1]
svc_dcfun = svc.decision_function(X_test)
ap_rf = average_precision_score(y_test, rf_pro)
ap_svc = average_precision_score(y_test, svc_dcfun)
print("랜덤 포레스트의 평균 정밀도: {:.3f}".format(ap_rf))
print("svc의 평균 정밀도: {:.3f}".format(ap_svc))
랜덤 포레스트의 평균 정밀도: 0.608 svc의 평균 정밀도: 0.632
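# average_precision_score summarizes the PR curve as a weighted mean of
# precisions, AP = sum_n (R_n - R_{n-1}) * P_n. Recomputing it by hand from the
# svc curve (illustrative; the recall array is decreasing, hence the minus sign):
p_svc, r_svc, _ = precision_recall_curve(y_test, svc_dcfun)
print(-np.sum(np.diff(r_svc) * p_svc[:-1]))  # should match ap_svc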
from sklearn.metrics import roc_curve
fpr, tpr, thresholds = roc_curve(y_test, svc.decision_function(X_test))
fpr.shape, tpr.shape, thresholds
((120,), (120,), array([ 2.49687998, 1.49687998, 1.47581867, 1.4610055 , 1.45802113, 1.44126949, 1.39147826, 1.37380751, 1.35378001, 1.35115809, 1.23928869, 1.22953332, 1.21983036, 1.20198557, 1.18379643, 1.16423601, 1.03010903, 0.96014974, 0.95503679, 0.9238038 , 0.91992255, 0.91412964, 0.91086392, 0.91041837, 0.86674391, 0.85989618, 0.85955523, 0.77050185, 0.76271236, 0.76034868, 0.74948191, 0.74083693, 0.67295553, 0.6313474 , 0.59612443, 0.56820147, 0.5421393 , 0.52460529, 0.51559663, 0.50719384, 0.37819992, 0.35974527, 0.35286999, 0.35078072, 0.33633912, 0.28457478, 0.28263032, 0.262315 , 0.22681224, 0.22489642, 0.20402619, 0.19841834, 0.1535657 , 0.13174612, 0.11071806, 0.06216232, 0.05418311, 0.05175504, 0.02982469, -0.01082579, -0.0441958 , -0.07263297, -0.14957616, -0.16204013, -0.22312698, -0.26617542, -0.28148442, -0.29640122, -0.33187486, -0.36364591, -0.37896216, -0.413147 , -0.41411063, -0.46562446, -0.47806116, -0.48767225, -0.5058179 , -0.51036328, -0.5110437 , -0.52644175, -0.53222803, -0.58481903, -0.58940514, -0.630053 , -0.65681141, -0.69947483, -0.70880328, -0.76236455, -0.76766451, -0.79133577, -0.80125093, -0.84469204, -0.84626488, -0.89704645, -0.89985197, -0.91717414, -0.93386616, -0.98339756, -0.98997147, -0.99198362, -0.99232122, -1.00442937, -1.00450048, -1.0275707 , -1.02764516, -1.05723133, -1.05811294, -1.06372408, -1.06460692, -1.09122745, -1.0915653 , -1.11441756, -1.11452809, -1.18688593, -1.18689626, -1.38761101, -1.3947979 , -1.53375669, -1.55283915, -1.77988356]))
plt.plot(fpr, tpr, label="ROC 곡선")
plt.xlabel("FPR")
plt.ylabel("TPR (재현율)")
# 임계값이 0 근처의 임계값을 찾습니다
close_zero = np.argmin(np.abs(thresholds))
plt.plot(fpr[close_zero], tpr[close_zero], 'o', markersize=10,
label="임계값 0", fillstyle="none", c='k', mew=2)
plt.legend(loc=4)
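# Illustrative check: FPR = FP / (FP + TN) and TPR = TP / (TP + FN). Recomputing
# the marked point from the default predictions (score threshold 0) by hand:
from sklearn.metrics import confusion_matrix
tn, fp, fn, tp = confusion_matrix(y_test, svc.predict(X_test)).ravel()
print(fp / (fp + tn), tp / (tp + fn))  # should agree with the marked point (up to the nearest available threshold)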
from sklearn.metrics import roc_curve
fpr_rf, tpr_rf, thresholds_rf = roc_curve(y_test, rf.predict_proba(X_test)[:, 1])
plt.plot(fpr, tpr, label="SVC의 ROC 곡선")
plt.plot(fpr_rf, tpr_rf, label="RF의 ROC 곡선")
plt.xlabel("FPR")
plt.ylabel("TPR (재현율)")
plt.plot(fpr[close_zero], tpr[close_zero], 'o', markersize=10,
label="SVC 임계값 0", fillstyle="none", c='k', mew=2)
close_05_rf = np.argmin(np.abs(thresholds_rf - 0.5))
plt.plot(fpr_rf[close_05_rf], tpr[close_05_rf], '^', markersize=10,
label="RF 임계값 0.5", fillstyle="none", c='k', mew=2)
plt.legend(loc=4)
from sklearn.metrics import roc_auc_score
rf_auc = roc_auc_score(y_test, rf.predict_proba(X_test)[:, 1])
svc_auc = roc_auc_score(y_test, svc.decision_function(X_test))
print("랜덤 포레스트의 AUC: {:.3f}".format(rf_auc))
print("SVC의 AUC: {:.3f}".format(svc_auc))
랜덤 포레스트의 AUC: 0.933 SVC의 AUC: 0.925
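# AUC has a ranking interpretation: it is the probability that a randomly
# chosen positive sample receives a higher score than a randomly chosen
# negative one. A direct pairwise check for the SVC scores (illustrative;
# ties are negligible for continuous decision-function scores):
scores = svc.decision_function(X_test)
pos, neg = scores[y_test == 1], scores[y_test == 0]
print((pos[:, None] > neg[None, :]).mean())  # should match svc_auc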
Written for educational purposes; prior permission is required for distribution or reproduction.
Copyright 2022 LIM Co. All rights reserved.